From 68569dee1416593955c1570d638b3d9250b33012 Mon Sep 17 00:00:00 2001 From: trav90 Date: Mon, 15 Oct 2018 21:45:30 -0500 Subject: Import aom library This is the reference implementation for the Alliance for Open Media's AV1 video codec. The commit used was 4d668d7feb1f8abd809d1bca0418570a7f142a36. --- third_party/aom/aom_dsp/add_noise.c | 73 + third_party/aom/aom_dsp/ans.h | 44 + third_party/aom/aom_dsp/ansreader.h | 214 ++ third_party/aom/aom_dsp/answriter.h | 148 + third_party/aom/aom_dsp/aom_convolve.c | 854 +++++ third_party/aom/aom_dsp/aom_convolve.h | 57 + third_party/aom/aom_dsp/aom_dsp.cmake | 509 +++ third_party/aom/aom_dsp/aom_dsp.mk | 428 +++ third_party/aom/aom_dsp/aom_dsp_common.h | 107 + third_party/aom/aom_dsp/aom_dsp_rtcd.c | 16 + third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl | 1495 ++++++++ third_party/aom/aom_dsp/aom_filter.h | 43 + third_party/aom/aom_dsp/aom_simd.h | 37 + third_party/aom/aom_dsp/aom_simd_inline.h | 21 + .../aom/aom_dsp/arm/aom_convolve8_avg_neon.c | 364 ++ .../aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm | 295 ++ third_party/aom/aom_dsp/arm/aom_convolve8_neon.c | 331 ++ .../aom/aom_dsp/arm/aom_convolve8_neon_asm.asm | 273 ++ .../aom/aom_dsp/arm/aom_convolve_avg_neon.c | 145 + .../aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm | 119 + .../aom/aom_dsp/arm/aom_convolve_copy_neon.c | 93 + .../aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm | 87 + third_party/aom/aom_dsp/arm/aom_convolve_neon.c | 66 + third_party/aom/aom_dsp/arm/avg_neon.c | 254 ++ .../aom/aom_dsp/arm/bilinear_filter_media.asm | 240 ++ third_party/aom/aom_dsp/arm/fwd_txfm_neon.c | 221 ++ third_party/aom/aom_dsp/arm/hadamard_neon.c | 200 ++ .../aom/aom_dsp/arm/idct16x16_1_add_neon.asm | 201 ++ third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c | 59 + third_party/aom/aom_dsp/arm/idct16x16_add_neon.asm | 1182 +++++++ third_party/aom/aom_dsp/arm/idct16x16_add_neon.c | 1295 +++++++ third_party/aom/aom_dsp/arm/idct16x16_neon.c | 152 + 
.../aom/aom_dsp/arm/idct32x32_1_add_neon.asm | 147 + third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c | 141 + third_party/aom/aom_dsp/arm/idct32x32_add_neon.asm | 1302 +++++++ third_party/aom/aom_dsp/arm/idct32x32_add_neon.c | 686 ++++ third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.asm | 71 + third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c | 47 + third_party/aom/aom_dsp/arm/idct4x4_add_neon.asm | 193 ++ third_party/aom/aom_dsp/arm/idct4x4_add_neon.c | 146 + third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.asm | 91 + third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c | 62 + third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm | 522 +++ third_party/aom/aom_dsp/arm/idct8x8_add_neon.c | 509 +++ third_party/aom/aom_dsp/arm/intrapred_neon.c | 757 ++++ third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm | 633 ++++ third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm | 202 ++ third_party/aom/aom_dsp/arm/loopfilter_16_neon.c | 174 + third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm | 252 ++ third_party/aom/aom_dsp/arm/loopfilter_4_neon.c | 250 ++ third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm | 428 +++ third_party/aom/aom_dsp/arm/loopfilter_8_neon.c | 430 +++ third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm | 638 ++++ third_party/aom/aom_dsp/arm/loopfilter_neon.c | 49 + third_party/aom/aom_dsp/arm/sad4d_neon.c | 225 ++ third_party/aom/aom_dsp/arm/sad_media.asm | 98 + third_party/aom/aom_dsp/arm/sad_neon.c | 224 ++ third_party/aom/aom_dsp/arm/save_reg_neon.asm | 39 + .../aom/aom_dsp/arm/subpel_variance_media.c | 81 + third_party/aom/aom_dsp/arm/subpel_variance_neon.c | 134 + third_party/aom/aom_dsp/arm/subtract_neon.c | 80 + .../arm/variance_halfpixvar16x16_h_media.asm | 185 + .../arm/variance_halfpixvar16x16_hv_media.asm | 225 ++ .../arm/variance_halfpixvar16x16_v_media.asm | 187 + third_party/aom/aom_dsp/arm/variance_media.asm | 361 ++ third_party/aom/aom_dsp/arm/variance_neon.c | 400 +++ third_party/aom/aom_dsp/avg.c | 232 ++ third_party/aom/aom_dsp/binary_codes_reader.c | 
117 + third_party/aom/aom_dsp/binary_codes_reader.h | 38 + third_party/aom/aom_dsp/binary_codes_writer.c | 211 ++ third_party/aom/aom_dsp/binary_codes_writer.h | 70 + third_party/aom/aom_dsp/bitreader.h | 276 ++ third_party/aom/aom_dsp/bitreader_buffer.c | 47 + third_party/aom/aom_dsp/bitreader_buffer.h | 48 + third_party/aom/aom_dsp/bitwriter.h | 255 ++ third_party/aom/aom_dsp/bitwriter_buffer.c | 61 + third_party/aom/aom_dsp/bitwriter_buffer.h | 44 + third_party/aom/aom_dsp/blend.h | 42 + third_party/aom/aom_dsp/blend_a64_hmask.c | 71 + third_party/aom/aom_dsp/blend_a64_mask.c | 145 + third_party/aom/aom_dsp/blend_a64_vmask.c | 73 + third_party/aom/aom_dsp/buf_ans.c | 71 + third_party/aom/aom_dsp/buf_ans.h | 133 + third_party/aom/aom_dsp/daalaboolreader.c | 37 + third_party/aom/aom_dsp/daalaboolreader.h | 164 + third_party/aom/aom_dsp/daalaboolwriter.c | 32 + third_party/aom/aom_dsp/daalaboolwriter.h | 87 + third_party/aom/aom_dsp/dkboolreader.c | 110 + third_party/aom/aom_dsp/dkboolreader.h | 181 + third_party/aom/aom_dsp/dkboolwriter.c | 44 + third_party/aom/aom_dsp/dkboolwriter.h | 104 + third_party/aom/aom_dsp/entcode.c | 53 + third_party/aom/aom_dsp/entcode.h | 46 + third_party/aom/aom_dsp/entdec.c | 300 ++ third_party/aom/aom_dsp/entdec.h | 91 + third_party/aom/aom_dsp/entenc.c | 507 +++ third_party/aom/aom_dsp/entenc.h | 91 + third_party/aom/aom_dsp/fastssim.c | 493 +++ third_party/aom/aom_dsp/fwd_txfm.c | 809 +++++ third_party/aom/aom_dsp/fwd_txfm.h | 29 + third_party/aom/aom_dsp/intrapred.c | 971 ++++++ third_party/aom/aom_dsp/inv_txfm.c | 1445 ++++++++ third_party/aom/aom_dsp/inv_txfm.h | 91 + third_party/aom/aom_dsp/loopfilter.c | 900 +++++ third_party/aom/aom_dsp/mips/add_noise_msa.c | 60 + .../aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c | 704 ++++ .../aom/aom_dsp/mips/aom_convolve8_avg_msa.c | 605 ++++ .../aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c | 677 ++++ .../aom/aom_dsp/mips/aom_convolve8_horiz_msa.c | 692 ++++ 
third_party/aom/aom_dsp/mips/aom_convolve8_msa.c | 630 ++++ .../aom/aom_dsp/mips/aom_convolve8_vert_msa.c | 699 ++++ .../aom/aom_dsp/mips/aom_convolve_avg_msa.c | 233 ++ .../aom/aom_dsp/mips/aom_convolve_copy_msa.c | 248 ++ third_party/aom/aom_dsp/mips/aom_convolve_msa.h | 124 + third_party/aom/aom_dsp/mips/avg_msa.c | 57 + third_party/aom/aom_dsp/mips/common_dspr2.c | 31 + third_party/aom/aom_dsp/mips/common_dspr2.h | 49 + third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c | 256 ++ .../aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c | 802 +++++ third_party/aom/aom_dsp/mips/convolve2_dspr2.c | 1030 ++++++ .../aom/aom_dsp/mips/convolve2_horiz_dspr2.c | 681 ++++ .../aom/aom_dsp/mips/convolve2_vert_dspr2.c | 237 ++ third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c | 641 ++++ .../aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c | 998 ++++++ third_party/aom/aom_dsp/mips/convolve8_dspr2.c | 1590 +++++++++ .../aom/aom_dsp/mips/convolve8_horiz_dspr2.c | 878 +++++ .../aom/aom_dsp/mips/convolve8_vert_dspr2.c | 360 ++ .../aom/aom_dsp/mips/convolve_common_dspr2.h | 59 + third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c | 948 +++++ third_party/aom/aom_dsp/mips/fwd_txfm_msa.c | 246 ++ third_party/aom/aom_dsp/mips/fwd_txfm_msa.h | 381 ++ third_party/aom/aom_dsp/mips/idct16x16_msa.c | 486 +++ third_party/aom/aom_dsp/mips/idct32x32_msa.c | 730 ++++ third_party/aom/aom_dsp/mips/idct4x4_msa.c | 99 + third_party/aom/aom_dsp/mips/idct8x8_msa.c | 117 + third_party/aom/aom_dsp/mips/intrapred16_dspr2.c | 325 ++ third_party/aom/aom_dsp/mips/intrapred4_dspr2.c | 225 ++ third_party/aom/aom_dsp/mips/intrapred8_dspr2.c | 603 ++++ third_party/aom/aom_dsp/mips/intrapred_msa.c | 739 ++++ third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h | 80 + third_party/aom/aom_dsp/mips/inv_txfm_msa.h | 412 +++ third_party/aom/aom_dsp/mips/itrans16_dspr2.c | 1190 +++++++ third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c | 1042 ++++++ third_party/aom/aom_dsp/mips/itrans32_dspr2.c | 1030 ++++++ 
third_party/aom/aom_dsp/mips/itrans4_dspr2.c | 342 ++ third_party/aom/aom_dsp/mips/itrans8_dspr2.c | 645 ++++ third_party/aom/aom_dsp/mips/loopfilter_16_msa.c | 1487 ++++++++ third_party/aom/aom_dsp/mips/loopfilter_4_msa.c | 147 + third_party/aom/aom_dsp/mips/loopfilter_8_msa.c | 333 ++ .../aom/aom_dsp/mips/loopfilter_filters_dspr2.c | 327 ++ .../aom/aom_dsp/mips/loopfilter_filters_dspr2.h | 735 ++++ .../aom/aom_dsp/mips/loopfilter_macros_dspr2.h | 436 +++ .../aom/aom_dsp/mips/loopfilter_masks_dspr2.h | 356 ++ third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c | 589 ++++ .../aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c | 734 ++++ .../aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c | 757 ++++ third_party/aom/aom_dsp/mips/loopfilter_msa.h | 251 ++ third_party/aom/aom_dsp/mips/macros_msa.h | 2057 +++++++++++ third_party/aom/aom_dsp/mips/sad_msa.c | 1529 +++++++++ .../aom/aom_dsp/mips/sub_pixel_variance_msa.c | 1795 ++++++++++ third_party/aom/aom_dsp/mips/subtract_msa.c | 265 ++ third_party/aom/aom_dsp/mips/txfm_macros_msa.h | 97 + third_party/aom/aom_dsp/mips/variance_msa.c | 632 ++++ third_party/aom/aom_dsp/postproc.h | 26 + third_party/aom/aom_dsp/prob.c | 236 ++ third_party/aom/aom_dsp/prob.h | 198 ++ third_party/aom/aom_dsp/psnr.c | 373 ++ third_party/aom/aom_dsp/psnr.h | 79 + third_party/aom/aom_dsp/psnrhvs.c | 276 ++ third_party/aom/aom_dsp/quantize.c | 832 +++++ third_party/aom/aom_dsp/quantize.h | 120 + third_party/aom/aom_dsp/sad.c | 512 +++ third_party/aom/aom_dsp/simd/v128_intrinsics.h | 268 ++ third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h | 671 ++++ third_party/aom/aom_dsp/simd/v128_intrinsics_c.h | 707 ++++ third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h | 511 +++ third_party/aom/aom_dsp/simd/v256_intrinsics.h | 283 ++ third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h | 17 + third_party/aom/aom_dsp/simd/v256_intrinsics_c.h | 724 ++++ .../aom/aom_dsp/simd/v256_intrinsics_v128.h | 545 +++ third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h | 548 +++ 
third_party/aom/aom_dsp/simd/v64_intrinsics.h | 223 ++ third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h | 583 ++++ third_party/aom/aom_dsp/simd/v64_intrinsics_c.h | 919 +++++ third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h | 470 +++ third_party/aom/aom_dsp/ssim.c | 462 +++ third_party/aom/aom_dsp/ssim.h | 88 + third_party/aom/aom_dsp/subtract.c | 55 + third_party/aom/aom_dsp/sum_squares.c | 40 + third_party/aom/aom_dsp/txfm_common.h | 70 + third_party/aom/aom_dsp/variance.c | 1249 +++++++ third_party/aom/aom_dsp/variance.h | 132 + third_party/aom/aom_dsp/x86/aom_asm_stubs.c | 182 + .../aom/aom_dsp/x86/aom_convolve_copy_sse2.asm | 345 ++ .../aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm | 965 ++++++ .../x86/aom_high_subpixel_bilinear_sse2.asm | 497 +++ .../aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c | 575 ++++ .../aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c | 920 +++++ .../aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm | 990 ++++++ .../aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm | 883 +++++ .../aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm | 451 +++ .../aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm | 421 +++ third_party/aom/aom_dsp/x86/avg_intrin_sse2.c | 426 +++ third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm | 124 + third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c | 36 + third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c | 924 +++++ third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c | 285 ++ third_party/aom/aom_dsp/x86/blend_sse4.h | 146 + third_party/aom/aom_dsp/x86/convolve.h | 288 ++ third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c | 862 +++++ .../aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h | 3022 ++++++++++++++++ .../aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h | 3201 +++++++++++++++++ third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c | 24 + third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h | 35 + third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h | 1014 ++++++ third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c | 273 ++ third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h | 362 ++ 
.../aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm | 204 ++ .../aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm | 349 ++ .../aom/aom_dsp/x86/halfpix_variance_sse2.c | 77 + third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c | 1151 +++++++ .../aom/aom_dsp/x86/highbd_intrapred_sse2.asm | 456 +++ .../aom/aom_dsp/x86/highbd_loopfilter_sse2.c | 1140 ++++++ .../aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c | 155 + third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm | 290 ++ third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm | 366 ++ .../x86/highbd_subpel_variance_impl_sse2.asm | 1040 ++++++ third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c | 364 ++ .../aom/aom_dsp/x86/highbd_variance_impl_sse2.asm | 316 ++ third_party/aom/aom_dsp/x86/highbd_variance_sse2.c | 695 ++++ third_party/aom/aom_dsp/x86/highbd_variance_sse4.c | 216 ++ third_party/aom/aom_dsp/x86/intrapred_sse2.asm | 771 +++++ third_party/aom/aom_dsp/x86/intrapred_ssse3.asm | 410 +++ third_party/aom/aom_dsp/x86/inv_txfm_sse2.c | 3631 ++++++++++++++++++++ third_party/aom/aom_dsp/x86/inv_txfm_sse2.h | 265 ++ third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c | 1333 +++++++ third_party/aom/aom_dsp/x86/inv_wht_sse2.asm | 112 + third_party/aom/aom_dsp/x86/loopfilter_avx2.c | 915 +++++ third_party/aom/aom_dsp/x86/loopfilter_sse2.c | 1892 ++++++++++ .../aom/aom_dsp/x86/masked_sad_intrin_ssse3.c | 334 ++ .../aom/aom_dsp/x86/masked_variance_intrin_ssse3.c | 1948 +++++++++++ third_party/aom/aom_dsp/x86/obmc_sad_sse4.c | 262 ++ third_party/aom/aom_dsp/x86/obmc_variance_sse4.c | 355 ++ .../aom/aom_dsp/x86/quantize_avx_x86_64.asm | 547 +++ third_party/aom/aom_dsp/x86/quantize_sse2.c | 249 ++ .../aom/aom_dsp/x86/quantize_ssse3_x86_64.asm | 349 ++ third_party/aom/aom_dsp/x86/sad4d_avx2.c | 216 ++ third_party/aom/aom_dsp/x86/sad4d_sse2.asm | 253 ++ third_party/aom/aom_dsp/x86/sad_avx2.c | 187 + third_party/aom/aom_dsp/x86/sad_highbd_avx2.c | 1043 ++++++ third_party/aom/aom_dsp/x86/sad_impl_avx2.c | 233 ++ third_party/aom/aom_dsp/x86/sad_sse2.asm 
| 345 ++ third_party/aom/aom_dsp/x86/sad_sse3.asm | 377 ++ third_party/aom/aom_dsp/x86/sad_sse4.asm | 362 ++ third_party/aom/aom_dsp/x86/sad_ssse3.asm | 373 ++ third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm | 219 ++ .../aom/aom_dsp/x86/subpel_variance_sse2.asm | 1489 ++++++++ third_party/aom/aom_dsp/x86/subtract_sse2.asm | 150 + third_party/aom/aom_dsp/x86/sum_squares_sse2.c | 210 ++ third_party/aom/aom_dsp/x86/synonyms.h | 120 + third_party/aom/aom_dsp/x86/txfm_common_avx2.h | 204 ++ third_party/aom/aom_dsp/x86/txfm_common_intrin.h | 31 + third_party/aom/aom_dsp/x86/txfm_common_sse2.h | 326 ++ third_party/aom/aom_dsp/x86/variance_avx2.c | 192 ++ third_party/aom/aom_dsp/x86/variance_impl_avx2.c | 713 ++++ third_party/aom/aom_dsp/x86/variance_sse2.c | 690 ++++ 266 files changed, 119012 insertions(+) create mode 100644 third_party/aom/aom_dsp/add_noise.c create mode 100644 third_party/aom/aom_dsp/ans.h create mode 100644 third_party/aom/aom_dsp/ansreader.h create mode 100644 third_party/aom/aom_dsp/answriter.h create mode 100644 third_party/aom/aom_dsp/aom_convolve.c create mode 100644 third_party/aom/aom_dsp/aom_convolve.h create mode 100644 third_party/aom/aom_dsp/aom_dsp.cmake create mode 100644 third_party/aom/aom_dsp/aom_dsp.mk create mode 100644 third_party/aom/aom_dsp/aom_dsp_common.h create mode 100644 third_party/aom/aom_dsp/aom_dsp_rtcd.c create mode 100755 third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl create mode 100644 third_party/aom/aom_dsp/aom_filter.h create mode 100644 third_party/aom/aom_dsp/aom_simd.h create mode 100644 third_party/aom/aom_dsp/aom_simd_inline.h create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve8_neon.c create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c create mode 100644 
third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve_neon.c create mode 100644 third_party/aom/aom_dsp/arm/avg_neon.c create mode 100644 third_party/aom/aom_dsp/arm/bilinear_filter_media.asm create mode 100644 third_party/aom/aom_dsp/arm/fwd_txfm_neon.c create mode 100644 third_party/aom/aom_dsp/arm/hadamard_neon.c create mode 100644 third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c create mode 100644 third_party/aom/aom_dsp/arm/idct16x16_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct16x16_add_neon.c create mode 100644 third_party/aom/aom_dsp/arm/idct16x16_neon.c create mode 100644 third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c create mode 100644 third_party/aom/aom_dsp/arm/idct32x32_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct32x32_add_neon.c create mode 100644 third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c create mode 100644 third_party/aom/aom_dsp/arm/idct4x4_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct4x4_add_neon.c create mode 100644 third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c create mode 100644 third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct8x8_add_neon.c create mode 100644 third_party/aom/aom_dsp/arm/intrapred_neon.c create mode 100644 third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm create mode 100644 third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/loopfilter_16_neon.c create mode 100644 
third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/loopfilter_4_neon.c create mode 100644 third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/loopfilter_8_neon.c create mode 100644 third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/loopfilter_neon.c create mode 100644 third_party/aom/aom_dsp/arm/sad4d_neon.c create mode 100644 third_party/aom/aom_dsp/arm/sad_media.asm create mode 100644 third_party/aom/aom_dsp/arm/sad_neon.c create mode 100644 third_party/aom/aom_dsp/arm/save_reg_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/subpel_variance_media.c create mode 100644 third_party/aom/aom_dsp/arm/subpel_variance_neon.c create mode 100644 third_party/aom/aom_dsp/arm/subtract_neon.c create mode 100644 third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm create mode 100644 third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm create mode 100644 third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm create mode 100644 third_party/aom/aom_dsp/arm/variance_media.asm create mode 100644 third_party/aom/aom_dsp/arm/variance_neon.c create mode 100644 third_party/aom/aom_dsp/avg.c create mode 100644 third_party/aom/aom_dsp/binary_codes_reader.c create mode 100644 third_party/aom/aom_dsp/binary_codes_reader.h create mode 100644 third_party/aom/aom_dsp/binary_codes_writer.c create mode 100644 third_party/aom/aom_dsp/binary_codes_writer.h create mode 100644 third_party/aom/aom_dsp/bitreader.h create mode 100644 third_party/aom/aom_dsp/bitreader_buffer.c create mode 100644 third_party/aom/aom_dsp/bitreader_buffer.h create mode 100644 third_party/aom/aom_dsp/bitwriter.h create mode 100644 third_party/aom/aom_dsp/bitwriter_buffer.c create mode 100644 third_party/aom/aom_dsp/bitwriter_buffer.h create mode 100644 third_party/aom/aom_dsp/blend.h create mode 100644 third_party/aom/aom_dsp/blend_a64_hmask.c 
create mode 100644 third_party/aom/aom_dsp/blend_a64_mask.c create mode 100644 third_party/aom/aom_dsp/blend_a64_vmask.c create mode 100644 third_party/aom/aom_dsp/buf_ans.c create mode 100644 third_party/aom/aom_dsp/buf_ans.h create mode 100644 third_party/aom/aom_dsp/daalaboolreader.c create mode 100644 third_party/aom/aom_dsp/daalaboolreader.h create mode 100644 third_party/aom/aom_dsp/daalaboolwriter.c create mode 100644 third_party/aom/aom_dsp/daalaboolwriter.h create mode 100644 third_party/aom/aom_dsp/dkboolreader.c create mode 100644 third_party/aom/aom_dsp/dkboolreader.h create mode 100644 third_party/aom/aom_dsp/dkboolwriter.c create mode 100644 third_party/aom/aom_dsp/dkboolwriter.h create mode 100644 third_party/aom/aom_dsp/entcode.c create mode 100644 third_party/aom/aom_dsp/entcode.h create mode 100644 third_party/aom/aom_dsp/entdec.c create mode 100644 third_party/aom/aom_dsp/entdec.h create mode 100644 third_party/aom/aom_dsp/entenc.c create mode 100644 third_party/aom/aom_dsp/entenc.h create mode 100644 third_party/aom/aom_dsp/fastssim.c create mode 100644 third_party/aom/aom_dsp/fwd_txfm.c create mode 100644 third_party/aom/aom_dsp/fwd_txfm.h create mode 100644 third_party/aom/aom_dsp/intrapred.c create mode 100644 third_party/aom/aom_dsp/inv_txfm.c create mode 100644 third_party/aom/aom_dsp/inv_txfm.h create mode 100644 third_party/aom/aom_dsp/loopfilter.c create mode 100644 third_party/aom/aom_dsp/mips/add_noise_msa.c create mode 100644 third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c create mode 100644 third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c create mode 100644 third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c create mode 100644 third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c create mode 100644 third_party/aom/aom_dsp/mips/aom_convolve8_msa.c create mode 100644 third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c create mode 100644 third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c create mode 100644 
third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c create mode 100644 third_party/aom/aom_dsp/mips/aom_convolve_msa.h create mode 100644 third_party/aom/aom_dsp/mips/avg_msa.c create mode 100644 third_party/aom/aom_dsp/mips/common_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/common_dspr2.h create mode 100644 third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve2_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve8_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve_common_dspr2.h create mode 100644 third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c create mode 100644 third_party/aom/aom_dsp/mips/fwd_txfm_msa.c create mode 100644 third_party/aom/aom_dsp/mips/fwd_txfm_msa.h create mode 100644 third_party/aom/aom_dsp/mips/idct16x16_msa.c create mode 100644 third_party/aom/aom_dsp/mips/idct32x32_msa.c create mode 100644 third_party/aom/aom_dsp/mips/idct4x4_msa.c create mode 100644 third_party/aom/aom_dsp/mips/idct8x8_msa.c create mode 100644 third_party/aom/aom_dsp/mips/intrapred16_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/intrapred4_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/intrapred8_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/intrapred_msa.c create mode 100644 third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h create mode 100644 third_party/aom/aom_dsp/mips/inv_txfm_msa.h create mode 100644 third_party/aom/aom_dsp/mips/itrans16_dspr2.c create 
mode 100644 third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/itrans32_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/itrans4_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/itrans8_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_16_msa.c create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_4_msa.c create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_8_msa.c create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_msa.h create mode 100644 third_party/aom/aom_dsp/mips/macros_msa.h create mode 100644 third_party/aom/aom_dsp/mips/sad_msa.c create mode 100644 third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c create mode 100644 third_party/aom/aom_dsp/mips/subtract_msa.c create mode 100644 third_party/aom/aom_dsp/mips/txfm_macros_msa.h create mode 100644 third_party/aom/aom_dsp/mips/variance_msa.c create mode 100644 third_party/aom/aom_dsp/postproc.h create mode 100644 third_party/aom/aom_dsp/prob.c create mode 100644 third_party/aom/aom_dsp/prob.h create mode 100644 third_party/aom/aom_dsp/psnr.c create mode 100644 third_party/aom/aom_dsp/psnr.h create mode 100644 third_party/aom/aom_dsp/psnrhvs.c create mode 100644 third_party/aom/aom_dsp/quantize.c create mode 100644 third_party/aom/aom_dsp/quantize.h create mode 100644 third_party/aom/aom_dsp/sad.c create mode 100644 third_party/aom/aom_dsp/simd/v128_intrinsics.h create mode 100644 
third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h create mode 100644 third_party/aom/aom_dsp/simd/v128_intrinsics_c.h create mode 100644 third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h create mode 100644 third_party/aom/aom_dsp/simd/v256_intrinsics.h create mode 100644 third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h create mode 100644 third_party/aom/aom_dsp/simd/v256_intrinsics_c.h create mode 100644 third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h create mode 100644 third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h create mode 100644 third_party/aom/aom_dsp/simd/v64_intrinsics.h create mode 100644 third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h create mode 100644 third_party/aom/aom_dsp/simd/v64_intrinsics_c.h create mode 100644 third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h create mode 100644 third_party/aom/aom_dsp/ssim.c create mode 100644 third_party/aom/aom_dsp/ssim.h create mode 100644 third_party/aom/aom_dsp/subtract.c create mode 100644 third_party/aom/aom_dsp/sum_squares.c create mode 100644 third_party/aom/aom_dsp/txfm_common.h create mode 100644 third_party/aom/aom_dsp/variance.c create mode 100644 third_party/aom/aom_dsp/variance.h create mode 100644 third_party/aom/aom_dsp/x86/aom_asm_stubs.c create mode 100644 third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm create mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm create mode 100644 third_party/aom/aom_dsp/x86/avg_intrin_sse2.c 
create mode 100644 third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm create mode 100644 third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/blend_sse4.h create mode 100644 third_party/aom/aom_dsp/x86/convolve.h create mode 100644 third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h create mode 100644 third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h create mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm create mode 100644 third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/highbd_variance_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_variance_sse4.c create mode 100644 
third_party/aom/aom_dsp/x86/intrapred_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/intrapred_ssse3.asm create mode 100644 third_party/aom/aom_dsp/x86/inv_txfm_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/inv_txfm_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/inv_wht_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/loopfilter_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/loopfilter_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/obmc_sad_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/obmc_variance_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm create mode 100644 third_party/aom/aom_dsp/x86/quantize_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm create mode 100644 third_party/aom/aom_dsp/x86/sad4d_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/sad4d_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/sad_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/sad_highbd_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/sad_impl_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/sad_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/sad_sse3.asm create mode 100644 third_party/aom/aom_dsp/x86/sad_sse4.asm create mode 100644 third_party/aom/aom_dsp/x86/sad_ssse3.asm create mode 100644 third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm create mode 100644 third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/subtract_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/sum_squares_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/synonyms.h create mode 100644 third_party/aom/aom_dsp/x86/txfm_common_avx2.h create mode 100644 third_party/aom/aom_dsp/x86/txfm_common_intrin.h 
create mode 100644 third_party/aom/aom_dsp/x86/txfm_common_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/variance_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/variance_impl_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/variance_sse2.c (limited to 'third_party/aom/aom_dsp') diff --git a/third_party/aom/aom_dsp/add_noise.c b/third_party/aom/aom_dsp/add_noise.c new file mode 100644 index 000000000..389cf2049 --- /dev/null +++ b/third_party/aom/aom_dsp/add_noise.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +void aom_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16], + char whiteclamp[16], char bothclamp[16], + unsigned int width, unsigned int height, int pitch) { + unsigned int i, j; + + for (i = 0; i < height; ++i) { + uint8_t *pos = start + i * pitch; + char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT + + for (j = 0; j < width; ++j) { + int v = pos[j]; + + v = clamp(v - blackclamp[0], 0, 255); + v = clamp(v + bothclamp[0], 0, 255); + v = clamp(v - whiteclamp[0], 0, 255); + + pos[j] = v + ref[j]; + } + } +} + +static double gaussian(double sigma, double mu, double x) { + return 1 / (sigma * sqrt(2.0 * 3.14159265)) * + (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma))); +} + +int aom_setup_noise(double sigma, int size, char *noise) { + char char_dist[256]; + int next = 0, i, j; + + // set up a 256 entry lookup that matches gaussian distribution + for (i = -32; i < 32; ++i) { + const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i)); + if (a_i) { + for (j = 0; j < a_i; ++j) { + char_dist[next + j] = (char)i; + } + next = next + j; + } + } + + // Rounding error - might mean we have less than 256. + for (; next < 256; ++next) { + char_dist[next] = 0; + } + + for (i = 0; i < size; ++i) { + noise[i] = char_dist[rand() & 0xff]; // NOLINT + } + + // Returns the highest non 0 value used in distribution. + return -char_dist[0]; +} diff --git a/third_party/aom/aom_dsp/ans.h b/third_party/aom/aom_dsp/ans.h new file mode 100644 index 000000000..a7a2f0eab --- /dev/null +++ b/third_party/aom/aom_dsp/ans.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_ANS_H_ +#define AOM_DSP_ANS_H_ +// Constants, types and utilities for Asymmetric Numeral Systems +// http://arxiv.org/abs/1311.2540v2 + +#include +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/prob.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Use windowed ANS, size is passed in at initialization +#define ANS_MAX_SYMBOLS 1 +#define ANS_REVERSE 1 + +typedef uint8_t AnsP8; +#define ANS_P8_PRECISION 256u +#define ANS_P8_SHIFT 8 +#define RANS_PROB_BITS 15 +#define RANS_PRECISION (1u << RANS_PROB_BITS) + +// L_BASE is the ANS base state. L_BASE % PRECISION must be 0. +#define L_BASE (1u << 17) +#define IO_BASE 256 +// Range I = { L_BASE, L_BASE + 1, ..., L_BASE * IO_BASE - 1 } + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // AOM_DSP_ANS_H_ diff --git a/third_party/aom/aom_dsp/ansreader.h b/third_party/aom/aom_dsp/ansreader.h new file mode 100644 index 000000000..e50c63b2d --- /dev/null +++ b/third_party/aom/aom_dsp/ansreader.h @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_ANSREADER_H_ +#define AOM_DSP_ANSREADER_H_ +// An implementation of Asymmetric Numeral Systems +// http://arxiv.org/abs/1311.2540v2 +// Implements decoding of: +// * rABS (range Asymmetric Binary Systems), a boolean coder +// * rANS (range Asymmetric Numeral Systems), a multi-symbol coder + +#include +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/prob.h" +#include "aom_dsp/ans.h" +#include "aom_ports/mem_ops.h" +#if CONFIG_ACCOUNTING +#include "av1/decoder/accounting.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +struct AnsDecoder { + const uint8_t *buf; + int buf_offset; + uint32_t state; +#if ANS_MAX_SYMBOLS + int symbols_left; + int window_size; +#endif +#if CONFIG_ACCOUNTING + Accounting *accounting; +#endif +}; + +static INLINE int ans_read_reinit(struct AnsDecoder *const ans); + +static INLINE unsigned refill_state(struct AnsDecoder *const ans, + unsigned state) { +#if ANS_REVERSE + while (state < L_BASE && ans->buf_offset < 0) { + state = state * IO_BASE + ans->buf[ans->buf_offset++]; + } +#else + while (state < L_BASE && ans->buf_offset > 0) { + state = state * IO_BASE + ans->buf[--ans->buf_offset]; + } +#endif + return state; +} + +// Decode one rABS encoded boolean where the probability of the value being zero +// is p0. +static INLINE int rabs_read(struct AnsDecoder *ans, AnsP8 p0) { +#if ANS_MAX_SYMBOLS + if (ans->symbols_left-- == 0) { + ans_read_reinit(ans); + ans->symbols_left--; + } +#endif + unsigned state = refill_state(ans, ans->state); + const unsigned quotient = state / ANS_P8_PRECISION; + const unsigned remainder = state % ANS_P8_PRECISION; + const int value = remainder >= p0; + const unsigned qp0 = quotient * p0; + if (value) + state = state - qp0 - p0; + else + state = qp0 + remainder; + ans->state = state; + return value; +} + +// Decode one rABS encoded boolean where the probability of the value being zero +// is one half. 
+static INLINE int rabs_read_bit(struct AnsDecoder *ans) { +#if ANS_MAX_SYMBOLS + if (ans->symbols_left-- == 0) { + ans_read_reinit(ans); + ans->symbols_left--; + } +#endif + unsigned state = refill_state(ans, ans->state); + const int value = !!(state & 0x80); + ans->state = ((state >> 1) & ~0x7F) | (state & 0x7F); + return value; +} + +struct rans_dec_sym { + uint8_t val; + aom_cdf_prob prob; + aom_cdf_prob cum_prob; // not-inclusive +}; + +static INLINE void fetch_sym(struct rans_dec_sym *out, const aom_cdf_prob *cdf, + aom_cdf_prob rem) { + int i; + aom_cdf_prob cum_prob = 0, top_prob; + // TODO(skal): if critical, could be a binary search. + // Or, better, an O(1) alias-table. + for (i = 0; rem >= (top_prob = cdf[i]); ++i) { + cum_prob = top_prob; + } + out->val = i; + out->prob = top_prob - cum_prob; + out->cum_prob = cum_prob; +} + +static INLINE int rans_read(struct AnsDecoder *ans, const aom_cdf_prob *tab) { + unsigned rem; + unsigned quo; + struct rans_dec_sym sym; +#if ANS_MAX_SYMBOLS + if (ans->symbols_left-- == 0) { + ans_read_reinit(ans); + ans->symbols_left--; + } +#endif + ans->state = refill_state(ans, ans->state); + quo = ans->state / RANS_PRECISION; + rem = ans->state % RANS_PRECISION; + fetch_sym(&sym, tab, rem); + ans->state = quo * sym.prob + rem - sym.cum_prob; + return sym.val; +} + +static INLINE int ans_read_init(struct AnsDecoder *const ans, + const uint8_t *const buf, int offset) { + unsigned x; + if (offset < 1) return 1; +#if ANS_REVERSE + ans->buf = buf + offset; + ans->buf_offset = -offset; + x = buf[0]; + if ((x & 0x80) == 0) { // Marker is 0xxx xxxx + if (offset < 2) return 1; + ans->buf_offset += 2; + ans->state = mem_get_be16(buf) & 0x7FFF; +#if L_BASE * IO_BASE > (1 << 23) + } else if ((x & 0xC0) == 0x80) { // Marker is 10xx xxxx + if (offset < 3) return 1; + ans->buf_offset += 3; + ans->state = mem_get_be24(buf) & 0x3FFFFF; + } else { // Marker is 11xx xxxx + if (offset < 4) return 1; + ans->buf_offset += 4; + ans->state = 
mem_get_be32(buf) & 0x3FFFFFFF; +#else + } else { // Marker is 1xxx xxxx + if (offset < 3) return 1; + ans->buf_offset += 3; + ans->state = mem_get_be24(buf) & 0x7FFFFF; +#endif + } +#else + ans->buf = buf; + x = buf[offset - 1]; + if ((x & 0x80) == 0) { // Marker is 0xxx xxxx + if (offset < 2) return 1; + ans->buf_offset = offset - 2; + ans->state = mem_get_le16(buf + offset - 2) & 0x7FFF; + } else if ((x & 0xC0) == 0x80) { // Marker is 10xx xxxx + if (offset < 3) return 1; + ans->buf_offset = offset - 3; + ans->state = mem_get_le24(buf + offset - 3) & 0x3FFFFF; + } else if ((x & 0xE0) == 0xE0) { // Marker is 111x xxxx + if (offset < 4) return 1; + ans->buf_offset = offset - 4; + ans->state = mem_get_le32(buf + offset - 4) & 0x1FFFFFFF; + } else { + // Marker 110x xxxx implies this byte is a superframe marker + return 1; + } +#endif // ANS_REVERSE +#if CONFIG_ACCOUNTING + ans->accounting = NULL; +#endif + ans->state += L_BASE; + if (ans->state >= L_BASE * IO_BASE) return 1; +#if ANS_MAX_SYMBOLS + assert(ans->window_size > 1); + ans->symbols_left = ans->window_size; +#endif + return 0; +} + +#if ANS_REVERSE +static INLINE int ans_read_reinit(struct AnsDecoder *const ans) { + return ans_read_init(ans, ans->buf + ans->buf_offset, -ans->buf_offset); +} +#endif + +static INLINE int ans_read_end(const struct AnsDecoder *const ans) { + return ans->buf_offset == 0 && ans->state < L_BASE; +} + +static INLINE int ans_reader_has_error(const struct AnsDecoder *const ans) { + return ans->state < L_BASE / RANS_PRECISION; +} +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // AOM_DSP_ANSREADER_H_ diff --git a/third_party/aom/aom_dsp/answriter.h b/third_party/aom/aom_dsp/answriter.h new file mode 100644 index 000000000..353acf1a9 --- /dev/null +++ b/third_party/aom/aom_dsp/answriter.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_ANSWRITER_H_ +#define AOM_DSP_ANSWRITER_H_ +// An implementation of Asymmetric Numeral Systems +// http://arxiv.org/abs/1311.2540v2 +// Implements encoding of: +// * rABS (range Asymmetric Binary Systems), a boolean coder +// * rANS (range Asymmetric Numeral Systems), a multi-symbol coder + +#include +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/ans.h" +#include "aom_dsp/prob.h" +#include "aom_ports/mem_ops.h" +#include "av1/common/odintrin.h" + +#if RANS_PRECISION <= OD_DIVU_DMAX +#define ANS_DIVREM(quotient, remainder, dividend, divisor) \ + do { \ + quotient = OD_DIVU_SMALL((dividend), (divisor)); \ + remainder = (dividend) - (quotient) * (divisor); \ + } while (0) +#else +#define ANS_DIVREM(quotient, remainder, dividend, divisor) \ + do { \ + quotient = (dividend) / (divisor); \ + remainder = (dividend) % (divisor); \ + } while (0) +#endif + +#define ANS_DIV8(dividend, divisor) OD_DIVU_SMALL((dividend), (divisor)) + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +struct AnsCoder { + uint8_t *buf; + int buf_offset; + uint32_t state; +}; + +static INLINE void ans_write_init(struct AnsCoder *const ans, + uint8_t *const buf) { + ans->buf = buf; + ans->buf_offset = 0; + ans->state = L_BASE; +} + +static INLINE int ans_write_end(struct AnsCoder *const ans) { + uint32_t state; + int ans_size; + assert(ans->state >= L_BASE); + assert(ans->state < L_BASE * IO_BASE); + state = ans->state - L_BASE; + if (state < (1u << 15)) { + 
mem_put_le16(ans->buf + ans->buf_offset, (0x00u << 15) + state); + ans_size = ans->buf_offset + 2; +#if ANS_REVERSE +#if L_BASE * IO_BASE > (1 << 23) + } else if (state < (1u << 22)) { + mem_put_le24(ans->buf + ans->buf_offset, (0x02u << 22) + state); + ans_size = ans->buf_offset + 3; + } else if (state < (1u << 30)) { + mem_put_le32(ans->buf + ans->buf_offset, (0x03u << 30) + state); + ans_size = ans->buf_offset + 4; +#else + } else if (state < (1u << 23)) { + mem_put_le24(ans->buf + ans->buf_offset, (0x01u << 23) + state); + ans_size = ans->buf_offset + 3; +#endif +#else + } else if (state < (1u << 22)) { + mem_put_le24(ans->buf + ans->buf_offset, (0x02u << 22) + state); + ans_size = ans->buf_offset + 3; + } else if (state < (1u << 29)) { + mem_put_le32(ans->buf + ans->buf_offset, (0x07u << 29) + state); + ans_size = ans->buf_offset + 4; +#endif + } else { + assert(0 && "State is too large to be serialized"); + return ans->buf_offset; + } +#if ANS_REVERSE + { + int i; + uint8_t tmp; + for (i = 0; i < (ans_size >> 1); i++) { + tmp = ans->buf[i]; + ans->buf[i] = ans->buf[ans_size - 1 - i]; + ans->buf[ans_size - 1 - i] = tmp; + } + ans->buf += ans_size; + ans->buf_offset = 0; + ans->state = L_BASE; + } +#endif + return ans_size; +} + +// Write one boolean using rABS where p0 is the probability of the value being +// zero. +static INLINE void rabs_write(struct AnsCoder *ans, int value, AnsP8 p0) { + const AnsP8 p = ANS_P8_PRECISION - p0; + const unsigned l_s = value ? p : p0; + unsigned state = ans->state; + while (state >= L_BASE / ANS_P8_PRECISION * IO_BASE * l_s) { + ans->buf[ans->buf_offset++] = state % IO_BASE; + state /= IO_BASE; + } + const unsigned quotient = ANS_DIV8(state, l_s); + const unsigned remainder = state - quotient * l_s; + ans->state = quotient * ANS_P8_PRECISION + remainder + (value ? p0 : 0); +} + +// Encode one symbol using rANS. 
+// cum_prob: The cumulative probability before this symbol (the offset of +// the symbol in the symbol cycle) +// prob: The probability of this symbol (l_s from the paper) +// RANS_PRECISION takes the place of m from the paper. +static INLINE void rans_write(struct AnsCoder *ans, aom_cdf_prob cum_prob, + aom_cdf_prob prob) { + unsigned quotient, remainder; + while (ans->state >= L_BASE / RANS_PRECISION * IO_BASE * prob) { + ans->buf[ans->buf_offset++] = ans->state % IO_BASE; + ans->state /= IO_BASE; + } + ANS_DIVREM(quotient, remainder, ans->state, prob); + ans->state = quotient * RANS_PRECISION + remainder + cum_prob; +} + +#undef ANS_DIV8 +#undef ANS_DIVREM +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // AOM_DSP_ANSWRITER_H_ diff --git a/third_party/aom/aom_dsp/aom_convolve.c b/third_party/aom/aom_dsp/aom_convolve.c new file mode 100644 index 000000000..74f4c00fb --- /dev/null +++ b/third_party/aom/aom_dsp/aom_convolve.c @@ -0,0 +1,854 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO( + dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const 
y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = ROUND_POWER_OF_TWO( + dst[y * dst_stride] + + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), + 1); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *const x_filters, + int x0_q4, int x_step_q4, + const InterpKernel *const y_filters, int y0_q4, + int y_step_q4, int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. 
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; + int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, + MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w, + intermediate_height); + convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, + dst_stride, y_filters, y0_q4, y_step_q4, w, h); +} + +static const InterpKernel *get_filter_base(const int16_t *filter) { + // NOTE: This assumes that the filter table is 256-byte aligned. + // TODO(agrange) Modify to make independent of table alignment. + return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); +} + +static int get_filter_offset(const int16_t *f, const InterpKernel *base) { + return (int)((const InterpKernel *)(intptr_t)f - base); +} + +void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + (void)filter_y; + (void)y_step_q4; + + convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, + w, h); +} + +void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + (void)filter_y; + (void)y_step_q4; + + convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h); +} + +void aom_convolve8_vert_c(const uint8_t *src, 
ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + + convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, + w, h); +} + +void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + + convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h); +} + +void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, + filters_y, y0_q4, y_step_q4, w, h); +} + +void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { + /* Fixed size intermediate buffer places limits on parameters. 
*/ + DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]); + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + + aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w, + h); +} + +void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int filter_x_stride, const int16_t *filter_y, + int filter_y_stride, int w, int h) { + int r; + + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; + + for (r = h; r > 0; --r) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } +} + +void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int filter_x_stride, const int16_t *filter_y, + int filter_y_stride, int w, int h) { + int x, y; + + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + + src += src_stride; + dst += dst_stride; + } +} + +void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { + aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); +} + +void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { + aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); +} + +void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const 
int16_t *filter_y, int y_step_q4, + int w, int h) { + aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); +} + +void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); +} + +void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); +} + +void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { + aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); +} + +#if CONFIG_LOOP_RESTORATION +static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) + + src_x[SUBPEL_TAPS / 2 - 1]); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, 
int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) + + src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, int x0_q4, + int x_step_q4, const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, int w, int h) { + uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; + int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, + temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w, + intermediate_height); + convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, + dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h); +} + +void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + (void)filter_y; + (void)y_step_q4; + + convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h); +} + +void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t 
dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + + convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h); +} + +void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, filters_y, y0_q4, y_step_q4, w, h); +} +#endif // CONFIG_LOOP_RESTORATION + +#if CONFIG_HIGHBITDEPTH +static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int 
h, int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO( + dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), + 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k 
= 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = ROUND_POWER_OF_TWO( + dst[y * dst_stride] + + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), + 1); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, int x0_q4, + int x_step_q4, const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, int w, int h, int bd) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. 
+ uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; + int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, + CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4, + x_step_q4, w, intermediate_height, bd); + highbd_convolve_vert( + CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; + (void)y_step_q4; + + highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; + (void)y_step_q4; + + highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + (void)filter_x; + (void)x_step_q4; + + 
highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + (void)filter_x; + (void)x_step_q4; + + highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, + filters_y, y0_q4, y_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { + // Fixed size intermediate buffer places limits on parameters. 
+ DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]); + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + + aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, + filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); + aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst, + dst_stride, NULL, 0, NULL, 0, w, h, bd); +} + +void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h, int bd) { + int r; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + (void)bd; + + for (r = h; r > 0; --r) { + memcpy(dst, src, w * sizeof(uint16_t)); + src += src_stride; + dst += dst_stride; + } +} + +void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h, int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + (void)bd; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + } + src += src_stride; + dst += dst_stride; + } +} + +#if CONFIG_LOOP_RESTORATION +static void highbd_convolve_add_src_horiz(const uint8_t *src8, + ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, int w, + int h, int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const 
uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel_highbd( + ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1], + bd); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void highbd_convolve_add_src_vert(const uint8_t *src8, + ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, int h, + int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) + + src_y[(SUBPEL_TAPS / 2 - 1) * src_stride], + bd); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + int x0_q4, int x_step_q4, + const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, int w, int h, + int bd) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. 
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; + int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + highbd_convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, CONVERT_TO_BYTEPTR(temp), + MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w, + intermediate_height, bd); + highbd_convolve_add_src_vert( + CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_add_src_horiz_c( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; + (void)y_step_q4; + + highbd_convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, + x0_q4, x_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_add_src_vert_c(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + (void)filter_x; + (void)x_step_q4; + + highbd_convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, + y0_q4, y_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_add_src_c(const 
uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + highbd_convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd); +} +#endif // CONFIG_LOOP_RESTORATION +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/aom_convolve.h b/third_party/aom/aom_dsp/aom_convolve.h new file mode 100644 index 000000000..d0de6c5d2 --- /dev/null +++ b/third_party/aom/aom_dsp/aom_convolve.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_DSP_AOM_CONVOLVE_H_ +#define AOM_DSP_AOM_CONVOLVE_H_ + +#include "./aom_config.h" +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Note: Fixed size intermediate buffers, place limits on parameters +// of some functions. 2d filtering proceeds in 2 steps: +// (1) Interpolate horizontally into an intermediate buffer, temp. +// (2) Interpolate temp vertically to derive the sub-pixel result. +// Deriving the maximum number of rows in the temp buffer (135): +// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). 
+// --Largest block size is 64x64 pixels. +// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the +// original frame (in 1/16th pixel units). +// --Must round-up because block may be located at sub-pixel position. +// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. +// --((64 - 1) * 32 + 15) >> 4 + 8 = 135. +#if CONFIG_AV1 && CONFIG_EXT_PARTITION +#define MAX_EXT_SIZE 263 +#else +#define MAX_EXT_SIZE 135 +#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION + +typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h); + +#if CONFIG_HIGHBITDEPTH +typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_AOM_CONVOLVE_H_ diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake new file mode 100644 index 000000000..f00348cbc --- /dev/null +++ b/third_party/aom/aom_dsp/aom_dsp.cmake @@ -0,0 +1,509 @@ +## +## Copyright (c) 2017, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+## +set(AOM_DSP_COMMON_SOURCES + "${AOM_ROOT}/aom_dsp/aom_convolve.c" + "${AOM_ROOT}/aom_dsp/aom_convolve.h" + "${AOM_ROOT}/aom_dsp/aom_dsp_common.h" + "${AOM_ROOT}/aom_dsp/aom_filter.h" + "${AOM_ROOT}/aom_dsp/aom_simd.h" + "${AOM_ROOT}/aom_dsp/aom_simd_inline.h" + "${AOM_ROOT}/aom_dsp/blend.h" + "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c" + "${AOM_ROOT}/aom_dsp/blend_a64_mask.c" + "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c" + "${AOM_ROOT}/aom_dsp/intrapred.c" + "${AOM_ROOT}/aom_dsp/loopfilter.c" + "${AOM_ROOT}/aom_dsp/prob.c" + "${AOM_ROOT}/aom_dsp/prob.h" + "${AOM_ROOT}/aom_dsp/sad.c" + "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h" + "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h" + "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h" + "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h" + "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h" + "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h" + "${AOM_ROOT}/aom_dsp/subtract.c" + "${AOM_ROOT}/aom_dsp/txfm_common.h" + "${AOM_ROOT}/aom_dsp/x86/txfm_common_intrin.h") + +set(AOM_DSP_COMMON_ASM_SSE2 + "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.asm") + +set(AOM_DSP_COMMON_INTRIN_SSE2 + "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c" + "${AOM_ROOT}/aom_dsp/x86/convolve.h" + "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c") + +set(AOM_DSP_COMMON_ASM_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm" + "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.asm") + +set(AOM_DSP_COMMON_INTRIN_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/inv_txfm_ssse3.c") + +set(AOM_DSP_COMMON_INTRIN_SSE4_1 + "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c") + 
+set(AOM_DSP_COMMON_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c") + +set(AOM_DSP_COMMON_ASM_NEON + "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_asm.asm" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon_asm.asm" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon_asm.asm" + "${AOM_ROOT}/aom_dsp/arm/idct16x16_1_add_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/idct16x16_add_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/idct32x32_1_add_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/idct32x32_add_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/idct4x4_1_add_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/idct4x4_add_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/intrapred_neon_asm.asm" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_mb_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/save_reg_neon.asm") + +set(AOM_DSP_COMMON_INTRIN_NEON + "${AOM_ROOT}/aom_dsp/arm/aom_convolve_neon.c" + "${AOM_ROOT}/aom_dsp/arm/avg_neon.c" + "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c" + "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c" + "${AOM_ROOT}/aom_dsp/arm/idct16x16_neon.c" + "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c" + "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c" + "${AOM_ROOT}/aom_dsp/arm/sad_neon.c" + "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c" + "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c" + "${AOM_ROOT}/aom_dsp/arm/variance_neon.c") + +if ("${AOM_TARGET_CPU}" STREQUAL "arm64") + set(AOM_DSP_COMMON_INTRIN_NEON + ${AOM_DSP_COMMON_INTRIN_NEON} + "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon.c" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon.c" + 
"${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c" + "${AOM_ROOT}/aom_dsp/arm/idct16x16_1_add_neon.c" + "${AOM_ROOT}/aom_dsp/arm/idct16x16_add_neon.c" + "${AOM_ROOT}/aom_dsp/arm/idct32x32_1_add_neon.c" + "${AOM_ROOT}/aom_dsp/arm/idct32x32_add_neon.c" + "${AOM_ROOT}/aom_dsp/arm/idct4x4_1_add_neon.c" + "${AOM_ROOT}/aom_dsp/arm/idct4x4_add_neon.c" + "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.c" + "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.c" + "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.c" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.c" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.c") +endif () + +set(AOM_DSP_COMMON_INTRIN_DSPR2 + "${AOM_ROOT}/aom_dsp/mips/common_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/common_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/convolve2_avg_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve2_avg_horiz_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve2_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve2_horiz_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve2_vert_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve8_avg_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve8_avg_horiz_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve8_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve8_horiz_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve8_vert_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve_common_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_macros_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_masks_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_vert_dspr2.c") + +set(AOM_DSP_COMMON_INTRIN_MSA + 
"${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_vert_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_horiz_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_vert_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve_avg_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve_msa.h" + "${AOM_ROOT}/aom_dsp/mips/fwd_dct32x32_msa.c" + "${AOM_ROOT}/aom_dsp/mips/fwd_txfm_msa.c" + "${AOM_ROOT}/aom_dsp/mips/fwd_txfm_msa.h" + "${AOM_ROOT}/aom_dsp/mips/idct16x16_msa.c" + "${AOM_ROOT}/aom_dsp/mips/idct32x32_msa.c" + "${AOM_ROOT}/aom_dsp/mips/idct4x4_msa.c" + "${AOM_ROOT}/aom_dsp/mips/idct8x8_msa.c" + "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c" + "${AOM_ROOT}/aom_dsp/mips/inv_txfm_msa.h" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_16_msa.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_4_msa.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_8_msa.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_msa.h" + "${AOM_ROOT}/aom_dsp/mips/macros_msa.h" + "${AOM_ROOT}/aom_dsp/mips/txfm_macros_msa.h") + +if (CONFIG_HIGHBITDEPTH) + set(AOM_DSP_COMMON_ASM_SSE2 + ${AOM_DSP_COMMON_ASM_SSE2} + "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.asm") + + set(AOM_DSP_COMMON_INTRIN_SSE2 + ${AOM_DSP_COMMON_INTRIN_SSE2} + "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c") + + set(AOM_DSP_COMMON_INTRIN_AVX2 + ${AOM_DSP_COMMON_INTRIN_AVX2} + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c") +else () + set(AOM_DSP_COMMON_INTRIN_DSPR2 + ${AOM_DSP_COMMON_INTRIN_DSPR2} + "${AOM_ROOT}/aom_dsp/mips/itrans16_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/itrans32_cols_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/itrans32_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/itrans4_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/itrans8_dspr2.c") +endif () + +if (CONFIG_ANS) + set(AOM_DSP_COMMON_SOURCES + ${AOM_DSP_COMMON_SOURCES} + "${AOM_ROOT}/aom_dsp/ans.h") +elseif (CONFIG_DAALA_EC) + 
set(AOM_DSP_COMMON_SOURCES + ${AOM_DSP_COMMON_SOURCES} + "${AOM_ROOT}/aom_dsp/entcode.c" + "${AOM_ROOT}/aom_dsp/entcode.h") +endif () + +if (CONFIG_AV1) + set(AOM_DSP_COMMON_SOURCES + ${AOM_DSP_COMMON_SOURCES} + "${AOM_ROOT}/aom_dsp/inv_txfm.c" + "${AOM_ROOT}/aom_dsp/inv_txfm.h") + + set(AOM_DSP_COMMON_ASM_SSE2 + ${AOM_DSP_COMMON_ASM_SSE2} + "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm") + + set(AOM_DSP_COMMON_INTRIN_SSE2 + ${AOM_DSP_COMMON_INTRIN_SSE2} + "${AOM_ROOT}/aom_dsp/x86/inv_txfm_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/inv_txfm_sse2.h") +endif () + +if (CONFIG_DECODERS) + set(AOM_DSP_DECODER_SOURCES + "${AOM_ROOT}/aom_dsp/binary_codes_reader.c" + "${AOM_ROOT}/aom_dsp/binary_codes_reader.h" + "${AOM_ROOT}/aom_dsp/bitreader.h" + "${AOM_ROOT}/aom_dsp/bitreader_buffer.c" + "${AOM_ROOT}/aom_dsp/bitreader_buffer.h") + + if (CONFIG_ANS) + set(AOM_DSP_DECODER_SOURCES + ${AOM_DSP_DECODER_SOURCES} + "${AOM_ROOT}/aom_dsp/ansreader.h") + elseif (CONFIG_DAALA_EC) + set(AOM_DSP_DECODER_SOURCES + ${AOM_DSP_DECODER_SOURCES} + "${AOM_ROOT}/aom_dsp/daalaboolreader.c" + "${AOM_ROOT}/aom_dsp/daalaboolreader.h" + "${AOM_ROOT}/aom_dsp/entdec.c" + "${AOM_ROOT}/aom_dsp/entdec.h") + else () + set(AOM_DSP_DECODER_SOURCES + ${AOM_DSP_DECODER_SOURCES} + "${AOM_ROOT}/aom_dsp/dkboolreader.c" + "${AOM_ROOT}/aom_dsp/dkboolreader.h") + endif () +endif () + +if (CONFIG_ENCODERS) + set(AOM_DSP_ENCODER_SOURCES + "${AOM_ROOT}/aom_dsp/binary_codes_writer.c" + "${AOM_ROOT}/aom_dsp/binary_codes_writer.h" + "${AOM_ROOT}/aom_dsp/bitwriter.h" + "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c" + "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h" + "${AOM_ROOT}/aom_dsp/psnr.c" + "${AOM_ROOT}/aom_dsp/psnr.h" + "${AOM_ROOT}/aom_dsp/variance.c" + "${AOM_ROOT}/aom_dsp/variance.h") + + set(AOM_DSP_ENCODER_ASM_SSE2 + ${AOM_DSP_ENCODER_ASM_SSE2} + "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_impl_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm" + 
"${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm") + + set(AOM_DSP_ENCODER_INTRIN_SSE2 + "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c") + + set(AOM_DSP_ENCODER_ASM_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/sad_ssse3.asm") + + set(AOM_DSP_ENCODER_ASM_SSSE3_X86_64 + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm" + "${AOM_ROOT}/aom_dsp/x86/ssim_opt_x86_64.asm") + + set(AOM_DSP_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/aom_dsp/x86/sad_sse3.asm") + set(AOM_DSP_ENCODER_ASM_SSE4_1 "${AOM_ROOT}/aom_dsp/x86/sad_sse4.asm") + + set(AOM_DSP_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c") + + if (CONFIG_AV1_ENCODER) + set(AOM_DSP_ENCODER_SOURCES + ${AOM_DSP_ENCODER_SOURCES} + "${AOM_ROOT}/aom_dsp/avg.c" + "${AOM_ROOT}/aom_dsp/fwd_txfm.c" + "${AOM_ROOT}/aom_dsp/fwd_txfm.h" + "${AOM_ROOT}/aom_dsp/quantize.c" + "${AOM_ROOT}/aom_dsp/quantize.h" + "${AOM_ROOT}/aom_dsp/sum_squares.c") + + set(AOM_DSP_ENCODER_INTRIN_SSE2 + ${AOM_DSP_ENCODER_INTRIN_SSE2} + "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/fwd_dct32_8cols_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/fwd_dct32x32_impl_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c") + + set(AOM_DSP_ENCODER_INTRIN_SSSE3 + ${AOM_DSP_ENCODER_INTRIN_SSSE3} + "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c") + + set(AOM_DSP_ENCODER_ASM_SSSE3_X86_64 + ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64} + "${AOM_ROOT}/aom_dsp/x86/avg_ssse3_x86_64.asm" + "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm") + + set(AOM_DSP_ENCODER_AVX_ASM_X86_64 + ${AOM_DSP_ENCODER_AVX_ASM_X86_64} + 
"${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm") + + set(AOM_DSP_ENCODER_INTRIN_MSA + "${AOM_ROOT}/aom_dsp/mips/avg_msa.c" + "${AOM_ROOT}/aom_dsp/mips/sad_msa.c" + "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c" + "${AOM_ROOT}/aom_dsp/mips/variance_msa.c" + "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c") + + if (CONFIG_HIGHBITDEPTH) + set(AOM_DSP_ENCODER_INTRIN_SSE2 + ${AOM_DSP_ENCODER_INTRIN_SSE2} + "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c") + endif () + endif () + + if (CONFIG_HIGHBITDEPTH) + set(AOM_DSP_ENCODER_ASM_SSE2 + ${AOM_DSP_ENCODER_ASM_SSE2} + "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm") + + set(AOM_DSP_ENCODER_INTRIN_SSE2 + ${AOM_DSP_ENCODER_INTRIN_SSE2} + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c") + + set(AOM_DSP_ENCODER_INTRIN_SSE4_1 + ${AOM_DSP_ENCODER_INTRIN_SSE4_1} + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c") + + set(AOM_DSP_ENCODER_INTRIN_AVX2 + ${AOM_DSP_ENCODER_INTRIN_AVX2} + "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c") + endif () + + if (CONFIG_ANS) + set(AOM_DSP_ENCODER_SOURCES + ${AOM_DSP_ENCODER_SOURCES} + "${AOM_ROOT}/aom_dsp/answriter.h" + "${AOM_ROOT}/aom_dsp/buf_ans.c" + "${AOM_ROOT}/aom_dsp/buf_ans.h") + elseif (CONFIG_DAALA_EC) + set(AOM_DSP_ENCODER_SOURCES + ${AOM_DSP_ENCODER_SOURCES} + "${AOM_ROOT}/aom_dsp/daalaboolwriter.c" + "${AOM_ROOT}/aom_dsp/daalaboolwriter.h" + "${AOM_ROOT}/aom_dsp/entenc.c" + "${AOM_ROOT}/aom_dsp/entenc.h") + else () + set(AOM_DSP_ENCODER_SOURCES + ${AOM_DSP_ENCODER_SOURCES} + "${AOM_ROOT}/aom_dsp/dkboolwriter.c" + "${AOM_ROOT}/aom_dsp/dkboolwriter.h") + endif () + + if (CONFIG_INTERNAL_STATS) + set(AOM_DSP_ENCODER_SOURCES + 
${AOM_DSP_ENCODER_SOURCES} + "${AOM_ROOT}/aom_dsp/fastssim.c" + "${AOM_ROOT}/aom_dsp/psnrhvs.c" + "${AOM_ROOT}/aom_dsp/ssim.c" + "${AOM_ROOT}/aom_dsp/ssim.h") + endif () +endif () + +if (CONFIG_MOTION_VAR) + set(AOM_DSP_ENCODER_INTRIN_SSE4_1 + ${AOM_DSP_ENCODER_INTRIN_SSE4_1} + "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c") +endif () + +# Creates aom_dsp build targets. Must not be called until after libaom target +# has been created. +function (setup_aom_dsp_targets) + add_library(aom_dsp_common OBJECT ${AOM_DSP_COMMON_SOURCES}) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_common) + target_sources(aom PUBLIC $<TARGET_OBJECTS:aom_dsp_common>) + + if (CONFIG_DECODERS) + add_library(aom_dsp_decoder OBJECT ${AOM_DSP_DECODER_SOURCES}) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_decoder) + target_sources(aom PUBLIC $<TARGET_OBJECTS:aom_dsp_decoder>) + endif () + + if (CONFIG_ENCODERS) + add_library(aom_dsp_encoder OBJECT ${AOM_DSP_ENCODER_SOURCES}) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_encoder) + target_sources(aom PUBLIC $<TARGET_OBJECTS:aom_dsp_encoder>) + endif () + + if (HAVE_SSE2) + add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2" "aom") + add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_SSE2") + if (CONFIG_ENCODERS) + add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2" "aom") + add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SSE2") + endif() + endif () + + if (HAVE_SSE3 AND CONFIG_ENCODERS) + add_asm_library("aom_dsp_encoder_sse3" "AOM_DSP_ENCODER_INTRIN_SSE3" "aom") + endif () + + if (HAVE_SSSE3) + add_asm_library("aom_dsp_common_ssse3" "AOM_DSP_COMMON_ASM_SSSE3" "aom") + add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_SSSE3") + + if (CONFIG_ENCODERS) + if ("${AOM_TARGET_CPU}" STREQUAL "x86_64") + list(APPEND AOM_DSP_ENCODER_ASM_SSSE3 + ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64}) + endif () + add_asm_library("aom_dsp_encoder_ssse3"
"AOM_DSP_ENCODER_ASM_SSSE3" "aom") + add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SSSE3") + endif () + endif () + + if (HAVE_SSE4_1) + add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_SSE4_1") + if (CONFIG_ENCODERS) + if (AOM_DSP_ENCODER_INTRIN_SSE4_1) + add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SSE4_1") + endif () + add_asm_library("aom_dsp_encoder_sse4_1" "AOM_DSP_ENCODER_ASM_SSE4_1" + "aom") + endif () + endif () + + if (HAVE_AVX AND "${AOM_TARGET_CPU}" STREQUAL "x86_64") + add_asm_library("aom_dsp_encoder_avx" "AOM_DSP_ENCODER_AVX_ASM_X86_64" + "aom") + endif () + + if (HAVE_AVX2) + add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_AVX2") + if (CONFIG_ENCODERS) + add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_AVX2") + endif () + endif () + + if (HAVE_NEON_ASM) + if (AOM_ADS2GAS_REQUIRED) + add_gas_asm_library("aom_dsp_common_neon" "AOM_DSP_COMMON_ASM_NEON" "aom") + else () + add_asm_library("aom_dsp_common_neon" "AOM_DSP_COMMON_ASM_NEON" "aom") + endif () + endif () + + if (HAVE_NEON) + add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" + "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON") + endif () + + if (HAVE_DSPR2) + add_intrinsics_object_library("" "dspr2" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_DSPR2") + endif () + + if (HAVE_MSA) + add_intrinsics_object_library("" "msa" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_MSA") + if (CONFIG_ENCODERS) + add_intrinsics_object_library("" "msa" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_MSA") + endif () + endif () + + # Pass the new lib targets up to the parent scope instance of + # $AOM_LIB_TARGETS. 
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) +endfunction () diff --git a/third_party/aom/aom_dsp/aom_dsp.mk b/third_party/aom/aom_dsp/aom_dsp.mk new file mode 100644 index 000000000..8c7241b83 --- /dev/null +++ b/third_party/aom/aom_dsp/aom_dsp.mk @@ -0,0 +1,428 @@ +## +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## + + +DSP_SRCS-yes += aom_dsp.mk +DSP_SRCS-yes += aom_dsp_common.h + +DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h + +DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/synonyms.h + +# bit reader +DSP_SRCS-yes += prob.h +DSP_SRCS-yes += prob.c +DSP_SRCS-$(CONFIG_ANS) += ans.h + +ifeq ($(CONFIG_ENCODERS),yes) +ifeq ($(CONFIG_ANS),yes) +DSP_SRCS-yes += answriter.h +DSP_SRCS-yes += buf_ans.h +DSP_SRCS-yes += buf_ans.c +else ifeq ($(CONFIG_DAALA_EC),yes) +DSP_SRCS-yes += entenc.c +DSP_SRCS-yes += entenc.h +DSP_SRCS-yes += daalaboolwriter.c +DSP_SRCS-yes += daalaboolwriter.h +else +DSP_SRCS-yes += dkboolwriter.h +DSP_SRCS-yes += dkboolwriter.c +endif +DSP_SRCS-yes += bitwriter.h +DSP_SRCS-yes += bitwriter_buffer.c +DSP_SRCS-yes += bitwriter_buffer.h +DSP_SRCS-yes += binary_codes_writer.c +DSP_SRCS-yes += binary_codes_writer.h +DSP_SRCS-yes += psnr.c +DSP_SRCS-yes += psnr.h +DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c +DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h +DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c +DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c +endif + +ifeq ($(CONFIG_DECODERS),yes) +ifeq ($(CONFIG_ANS),yes) +DSP_SRCS-yes += ansreader.h +else ifeq 
($(CONFIG_DAALA_EC),yes) +DSP_SRCS-yes += entdec.c +DSP_SRCS-yes += entdec.h +DSP_SRCS-yes += daalaboolreader.c +DSP_SRCS-yes += daalaboolreader.h +else +DSP_SRCS-yes += dkboolreader.h +DSP_SRCS-yes += dkboolreader.c +endif +DSP_SRCS-yes += bitreader.h +DSP_SRCS-yes += bitreader_buffer.c +DSP_SRCS-yes += bitreader_buffer.h +DSP_SRCS-yes += binary_codes_reader.c +DSP_SRCS-yes += binary_codes_reader.h +endif + +# intra predictions +DSP_SRCS-yes += intrapred.c + +ifeq ($(CONFIG_DAALA_EC),yes) +DSP_SRCS-yes += entcode.c +DSP_SRCS-yes += entcode.h +endif + +DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm + +ifeq ($(CONFIG_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm +endif # CONFIG_HIGHBITDEPTH + +DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM) +DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c +DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c +DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred4_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred8_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred16_dspr2.c + +DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.h +DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.c + +# inter predictions +DSP_SRCS-yes += blend.h +DSP_SRCS-yes += blend_a64_mask.c +DSP_SRCS-yes += blend_a64_hmask.c +DSP_SRCS-yes += blend_a64_vmask.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_sse4.h +DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c + +# interpolation filters +DSP_SRCS-yes += aom_convolve.c +DSP_SRCS-yes += aom_convolve.h +DSP_SRCS-yes += aom_filter.h + +DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h +DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/aom_asm_stubs.c +DSP_SRCS-$(HAVE_SSE2) += x86/aom_subpixel_8t_sse2.asm 
+DSP_SRCS-$(HAVE_SSE2) += x86/aom_subpixel_bilinear_sse2.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_bilinear_ssse3.asm +DSP_SRCS-$(HAVE_AVX2) += x86/aom_subpixel_8t_intrin_avx2.c +DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_intrin_ssse3.c +ifeq ($(CONFIG_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/aom_high_subpixel_8t_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/aom_high_subpixel_bilinear_sse2.asm +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c +endif +DSP_SRCS-$(HAVE_SSE2) += x86/aom_convolve_copy_sse2.asm + +ifeq ($(HAVE_NEON_ASM),yes) +DSP_SRCS-yes += arm/aom_convolve_copy_neon_asm$(ASM) +DSP_SRCS-yes += arm/aom_convolve8_avg_neon_asm$(ASM) +DSP_SRCS-yes += arm/aom_convolve8_neon_asm$(ASM) +DSP_SRCS-yes += arm/aom_convolve_avg_neon_asm$(ASM) +DSP_SRCS-yes += arm/aom_convolve_neon.c +else +ifeq ($(HAVE_NEON),yes) +DSP_SRCS-yes += arm/aom_convolve_copy_neon.c +DSP_SRCS-yes += arm/aom_convolve8_avg_neon.c +DSP_SRCS-yes += arm/aom_convolve8_neon.c +DSP_SRCS-yes += arm/aom_convolve_avg_neon.c +DSP_SRCS-yes += arm/aom_convolve_neon.c +endif # HAVE_NEON +endif # HAVE_NEON_ASM + +# common (msa) +DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_horiz_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_vert_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_horiz_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_vert_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_avg_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_copy_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_msa.h + +# common (dspr2) +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve_common_dspr2.h +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_avg_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_avg_horiz_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_horiz_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += 
mips/convolve2_vert_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_avg_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_avg_horiz_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_horiz_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_vert_dspr2.c + +# loop filters +DSP_SRCS-yes += loopfilter.c + +DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c + +DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c +ifeq ($(HAVE_NEON_ASM),yes) +DSP_SRCS-yes += arm/loopfilter_mb_neon$(ASM) +DSP_SRCS-yes += arm/loopfilter_16_neon$(ASM) +DSP_SRCS-yes += arm/loopfilter_8_neon$(ASM) +DSP_SRCS-yes += arm/loopfilter_4_neon$(ASM) +else +ifeq ($(HAVE_NEON),yes) +DSP_SRCS-yes += arm/loopfilter_16_neon.c +DSP_SRCS-yes += arm/loopfilter_8_neon.c +DSP_SRCS-yes += arm/loopfilter_4_neon.c +endif # HAVE_NEON +endif # HAVE_NEON_ASM + +DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_msa.h +DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_16_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_8_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_4_msa.c +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_filters_dspr2.h +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_filters_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_macros_dspr2.h +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_masks_dspr2.h +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_horiz_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c + +ifeq ($(CONFIG_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c +endif # CONFIG_HIGHBITDEPTH + +DSP_SRCS-yes += txfm_common.h +DSP_SRCS-yes += x86/txfm_common_intrin.h +DSP_SRCS-$(HAVE_SSE2) += x86/txfm_common_sse2.h +DSP_SRCS-$(HAVE_MSA) += mips/txfm_macros_msa.h + +# forward transform +ifneq ($(findstring yes,$(CONFIG_AV1)$(CONFIG_PVQ)),) +DSP_SRCS-$(HAVE_AVX2) += x86/txfm_common_avx2.h +ifeq ($(CONFIG_AV1_ENCODER),yes) +DSP_SRCS-yes += 
fwd_txfm.c +DSP_SRCS-yes += fwd_txfm.h +DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32_8cols_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_impl_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h +ifeq ($(ARCH_X86_64),yes) +DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm +endif +DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.h +DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h +DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c +DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h +DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c +endif # CONFIG_AV1_ENCODER +endif # CONFIG_AV1 + +# inverse transform +ifeq ($(CONFIG_AV1), yes) +DSP_SRCS-yes += inv_txfm.h +DSP_SRCS-yes += inv_txfm.c +DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c + +ifeq ($(HAVE_NEON_ASM),yes) +DSP_SRCS-yes += arm/save_reg_neon$(ASM) +DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM) +DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM) +DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM) +DSP_SRCS-yes += arm/idct32x32_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct32x32_add_neon$(ASM) +else +ifeq ($(HAVE_NEON),yes) +DSP_SRCS-yes += arm/idct4x4_1_add_neon.c +DSP_SRCS-yes += arm/idct4x4_add_neon.c +DSP_SRCS-yes += arm/idct8x8_1_add_neon.c +DSP_SRCS-yes += arm/idct8x8_add_neon.c +DSP_SRCS-yes += arm/idct16x16_1_add_neon.c +DSP_SRCS-yes += arm/idct16x16_add_neon.c +DSP_SRCS-yes += arm/idct32x32_1_add_neon.c +DSP_SRCS-yes += arm/idct32x32_add_neon.c +endif # HAVE_NEON +endif # HAVE_NEON_ASM +DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c + +DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h 
+DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/idct8x8_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/idct16x16_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/idct32x32_msa.c + +ifneq ($(CONFIG_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_DSPR2) += mips/inv_txfm_dspr2.h +DSP_SRCS-$(HAVE_DSPR2) += mips/itrans4_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c +endif # CONFIG_HIGHBITDEPTH +endif # CONFIG_AV1 + +# quantization +ifneq ($(filter yes,$(CONFIG_AV1_ENCODER)),) +DSP_SRCS-yes += quantize.c +DSP_SRCS-yes += quantize.h + +DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c +ifeq ($(CONFIG_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c +endif +ifeq ($(ARCH_X86_64),yes) +DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm +DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx_x86_64.asm +endif + +# avg +DSP_SRCS-yes += avg.c +DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c +DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c +DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c +DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c +ifeq ($(ARCH_X86_64),yes) +DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm +endif + +# high bit depth subtract +ifeq ($(CONFIG_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subtract_sse2.c +endif + +endif # CONFIG_AV1_ENCODER + +ifeq ($(CONFIG_AV1_ENCODER),yes) +DSP_SRCS-yes += sum_squares.c + +DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c +endif # CONFIG_AV1_ENCODER + +ifeq ($(CONFIG_ENCODERS),yes) +DSP_SRCS-yes += sad.c +DSP_SRCS-yes += subtract.c + +DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM) +DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c + +DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c + +DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm +DSP_SRCS-$(HAVE_SSSE3) += 
x86/sad_ssse3.asm +DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm +DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c + +ifeq ($(CONFIG_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_AVX2) += x86/sad_highbd_avx2.c +endif + +ifeq ($(CONFIG_AV1_ENCODER),yes) +ifeq ($(CONFIG_EXT_INTER),yes) +DSP_SRCS-$(HAVE_SSSE3) += x86/masked_sad_intrin_ssse3.c +DSP_SRCS-$(HAVE_SSSE3) += x86/masked_variance_intrin_ssse3.c +endif #CONFIG_EXT_INTER +ifeq ($(CONFIG_MOTION_VAR),yes) +DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c +endif #CONFIG_MOTION_VAR +ifeq ($(CONFIG_EXT_PARTITION),yes) +DSP_SRCS-$(HAVE_AVX2) += x86/sad_impl_avx2.c +endif +endif #CONFIG_AV1_ENCODER + +DSP_SRCS-$(HAVE_SSE) += x86/sad4d_sse2.asm +DSP_SRCS-$(HAVE_SSE) += x86/sad_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm + +ifeq ($(CONFIG_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm +endif # CONFIG_HIGHBITDEPTH + +endif # CONFIG_ENCODERS + +ifneq ($(filter yes,$(CONFIG_ENCODERS)),) +DSP_SRCS-yes += variance.c +DSP_SRCS-yes += variance.h + +DSP_SRCS-$(HAVE_MEDIA) += arm/bilinear_filter_media$(ASM) +DSP_SRCS-$(HAVE_MEDIA) += arm/subpel_variance_media.c +DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_h_media$(ASM) +DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_hv_media$(ASM) +DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_v_media$(ASM) +DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM) +DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c + +DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c + +DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3 +DSP_SRCS-$(HAVE_SSE2) += x86/halfpix_variance_sse2.c 
+DSP_SRCS-$(HAVE_SSE2) += x86/halfpix_variance_impl_sse2.asm +DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c + +ifeq ($(ARCH_X86_64),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/ssim_opt_x86_64.asm +endif # ARCH_X86_64 + +DSP_SRCS-$(HAVE_SSE) += x86/subpel_variance_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/subpel_variance_sse2.asm # Contains SSE2 and SSSE3 + +ifeq ($(CONFIG_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_variance_sse4.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm +endif # CONFIG_HIGHBITDEPTH +endif # CONFIG_ENCODERS + +DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes) + +DSP_SRCS-yes += aom_dsp_rtcd.c +DSP_SRCS-yes += aom_dsp_rtcd_defs.pl + +DSP_SRCS-yes += aom_simd.h +DSP_SRCS-yes += aom_simd_inline.h +DSP_SRCS-yes += simd/v64_intrinsics.h +DSP_SRCS-yes += simd/v64_intrinsics_c.h +DSP_SRCS-yes += simd/v128_intrinsics.h +DSP_SRCS-yes += simd/v128_intrinsics_c.h +DSP_SRCS-yes += simd/v256_intrinsics.h +DSP_SRCS-yes += simd/v256_intrinsics_c.h +DSP_SRCS-yes += simd/v256_intrinsics_v128.h +DSP_SRCS-$(HAVE_SSE2) += simd/v64_intrinsics_x86.h +DSP_SRCS-$(HAVE_SSE2) += simd/v128_intrinsics_x86.h +DSP_SRCS-$(HAVE_SSE2) += simd/v256_intrinsics_x86.h +DSP_SRCS-$(HAVE_NEON) += simd/v64_intrinsics_arm.h +DSP_SRCS-$(HAVE_NEON) += simd/v128_intrinsics_arm.h +DSP_SRCS-$(HAVE_NEON) += simd/v256_intrinsics_arm.h + +$(eval $(call rtcd_h_template,aom_dsp_rtcd,aom_dsp/aom_dsp_rtcd_defs.pl)) diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h new file mode 100644 index 000000000..47ffbeb6c --- /dev/null +++ b/third_party/aom/aom_dsp/aom_dsp_common.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_AOM_DSP_COMMON_H_ +#define AOM_DSP_AOM_DSP_COMMON_H_ + +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef MAX_SB_SIZE +#if CONFIG_AV1 && CONFIG_EXT_PARTITION +#define MAX_SB_SIZE 128 +#else +#define MAX_SB_SIZE 64 +#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION +#endif // ndef MAX_SB_SIZE + +#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y)) +#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y)) + +#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b') + +#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0) + +/* Left shifting a negative value became undefined behavior in C99 (downgraded + from merely implementation-defined in C89). This should still compile to the + correct thing on any two's-complement machine, but avoid ubsan warnings.*/ +#define AOM_SIGNED_SHL(x, shift) ((x) * (((x)*0 + 1) << (shift))) + +// These can be used to give a hint about branch outcomes. +// This can have an effect, even if your target processor has a +// good branch predictor, as these hints can affect basic block +// ordering by the compiler. 
+#ifdef __GNUC__ +#define LIKELY(v) __builtin_expect(v, 1) +#define UNLIKELY(v) __builtin_expect(v, 0) +#else +#define LIKELY(v) (v) +#define UNLIKELY(v) (v) +#endif + +#define AOM_SWAP(type, a, b) \ + do { \ + type c = (b); \ + b = a; \ + a = c; \ + } while (0) + +#if CONFIG_AOM_QM +typedef uint16_t qm_val_t; +#define AOM_QM_BITS 6 +#endif +#if CONFIG_HIGHBITDEPTH +// Note: +// tran_low_t is the datatype used for final transform coefficients. +// tran_high_t is the datatype used for intermediate transform stages. +typedef int64_t tran_high_t; +typedef int32_t tran_low_t; +#else +// Note: +// tran_low_t is the datatype used for final transform coefficients. +// tran_high_t is the datatype used for intermediate transform stages. +typedef int32_t tran_high_t; +typedef int16_t tran_low_t; +#endif // CONFIG_HIGHBITDEPTH + +static INLINE uint8_t clip_pixel(int val) { + return (val > 255) ? 255 : (val < 0) ? 0 : val; +} + +static INLINE int clamp(int value, int low, int high) { + return value < low ? low : (value > high ? high : value); +} + +static INLINE double fclamp(double value, double low, double high) { + return value < low ? low : (value > high ? high : value); +} + +#if CONFIG_HIGHBITDEPTH +static INLINE uint16_t clip_pixel_highbd(int val, int bd) { + switch (bd) { + case 8: + default: return (uint16_t)clamp(val, 0, 255); + case 10: return (uint16_t)clamp(val, 0, 1023); + case 12: return (uint16_t)clamp(val, 0, 4095); + } +} +#endif // CONFIG_HIGHBITDEPTH + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_AOM_DSP_COMMON_H_ diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd.c b/third_party/aom/aom_dsp/aom_dsp_rtcd.c new file mode 100644 index 000000000..11a57d382 --- /dev/null +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd.c @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "./aom_config.h" +#define RTCD_C +#include "./aom_dsp_rtcd.h" +#include "aom_ports/aom_once.h" + +void aom_dsp_rtcd(void) { once(setup_rtcd_internal); } diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl new file mode 100755 index 000000000..b4ef0d92f --- /dev/null +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl @@ -0,0 +1,1495 @@ +sub aom_dsp_forward_decls() { +print < + +#if defined(_WIN32) +#include <intrin.h> +#endif + +#include "./aom_config.h" +#include "./aom_simd_inline.h" + +#define SIMD_CHECK 1 // Sanity checks in C equivalents + +#if HAVE_NEON +#include "simd/v256_intrinsics_arm.h" +// VS compiling for 32 bit targets does not support vector types in +// structs as arguments, which makes the v256 type of the intrinsics +// hard to support, so optimizations for this target are disabled. +#elif HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)) +#include "simd/v256_intrinsics_x86.h" +#else +#include "simd/v256_intrinsics.h" +#endif + +#endif // AOM_DSP_AOM_AOM_SIMD_H_ diff --git a/third_party/aom/aom_dsp/aom_simd_inline.h b/third_party/aom/aom_dsp/aom_simd_inline.h new file mode 100644 index 000000000..02a8b3a17 --- /dev/null +++ b/third_party/aom/aom_dsp/aom_simd_inline.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software.
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_AOM_SIMD_INLINE_H_ +#define AOM_DSP_AOM_SIMD_INLINE_H_ + +#include "aom/aom_integer.h" + +#ifndef SIMD_INLINE +#define SIMD_INLINE static AOM_FORCE_INLINE +#endif + +#endif // AOM_DSP_AOM_SIMD_INLINE_H_ diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c new file mode 100644 index 000000000..09429d6d2 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1, + int16x4_t dsrc2, int16x4_t dsrc3, + int16x4_t dsrc4, int16x4_t dsrc5, + int16x4_t dsrc6, int16x4_t dsrc7, + int16x8_t q0s16) { + int32x4_t qdst; + int16x4_t d0s16, d1s16; + + d0s16 = vget_low_s16(q0s16); + d1s16 = vget_high_s16(q0s16); + + qdst = vmull_lane_s16(dsrc0, d0s16, 0); + qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1); + qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2); + qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3); + qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0); + qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1); + qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2); + qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3); + return qdst; +} + +void aom_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, // unused + int y_step_q4, // unused + int w, int h) { + int width; + const uint8_t *s; + uint8_t *d; + uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; + uint32x2_t d2u32, d3u32, d6u32 = vdup_n_u32(0), d7u32 = vdup_n_u32(0), d28u32, d29u32, d30u32, d31u32; + uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16; + uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; + int16x8_t q0s16; + uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; + int32x4_t q1s32, q2s32, q14s32, q15s32; + uint16x8x2_t q0x2u16; + uint8x8x2_t d0x2u8, d1x2u8; + uint32x2x2_t d0x2u32; + uint16x4x2_t d0x2u16, d1x2u16; + uint32x4x2_t q0x2u32; + + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y_step_q4; + (void)filter_y; + + q0s16 = vld1q_s16(filter_x); + + src -= 3; // adjust for taps + for (; h > 0; h -= 4) { // loop_horiz_v + s = src; +
d24u8 = vld1_u8(s); + s += src_stride; + d25u8 = vld1_u8(s); + s += src_stride; + d26u8 = vld1_u8(s); + s += src_stride; + d27u8 = vld1_u8(s); + + q12u8 = vcombine_u8(d24u8, d25u8); + q13u8 = vcombine_u8(d26u8, d27u8); + + q0x2u16 = + vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8)); + d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); + d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); + d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); + d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); + d0x2u8 = vtrn_u8(d24u8, d25u8); + d1x2u8 = vtrn_u8(d26u8, d27u8); + + __builtin_prefetch(src + src_stride * 4); + __builtin_prefetch(src + src_stride * 5); + + q8u16 = vmovl_u8(d0x2u8.val[0]); + q9u16 = vmovl_u8(d0x2u8.val[1]); + q10u16 = vmovl_u8(d1x2u8.val[0]); + q11u16 = vmovl_u8(d1x2u8.val[1]); + + src += 7; + d16u16 = vget_low_u16(q8u16); + d17u16 = vget_high_u16(q8u16); + d18u16 = vget_low_u16(q9u16); + d19u16 = vget_high_u16(q9u16); + q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 + q9u16 = vcombine_u16(d17u16, d19u16); + + d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 + for (width = w; width > 0; width -= 4, src += 4, dst += 4) { // loop_horiz + s = src; + d28u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d29u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d31u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d30u32 = vld1_dup_u32((const uint32_t *)s); + + __builtin_prefetch(src + 64); + + d0x2u16 = + vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32)); + d1x2u16 = + vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32)); + d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 + vreinterpret_u8_u16(d1x2u16.val[0])); // d29 + d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 + vreinterpret_u8_u16(d1x2u16.val[1])); // d30 + + __builtin_prefetch(src + 
64 + src_stride); + + q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); + q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); + q0x2u32 = + vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8)); + + d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); + d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); + q12u16 = vmovl_u8(d28u8); + q13u16 = vmovl_u8(d29u8); + + __builtin_prefetch(src + 64 + src_stride * 2); + + d = dst; + d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); + d += dst_stride; + d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); + d += dst_stride; + d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); + d += dst_stride; + d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); + + d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); + d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); + d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); + d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16, + d23s16, d24s16, q0s16); + q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16, + d24s16, d26s16, q0s16); + q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, + d26s16, d27s16, q0s16); + q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, + d27s16, d25s16, q0s16); + + __builtin_prefetch(src + 64 + src_stride * 3); + + d2u16 = vqrshrun_n_s32(q1s32, 7); + d3u16 = vqrshrun_n_s32(q2s32, 7); + d4u16 = vqrshrun_n_s32(q14s32, 7); + d5u16 = vqrshrun_n_s32(q15s32, 7); + + q1u16 = vcombine_u16(d2u16, d3u16); + q2u16 = vcombine_u16(d4u16, d5u16); + + d2u8 = vqmovn_u16(q1u16); + d3u8 = vqmovn_u16(q2u16); + + d0x2u16 = 
/*
 * 8-tap vertical convolution of a w x h region, averaged into dst.
 *
 * For every output pixel the eight source rows starting three rows above it
 * are multiplied by the eight taps loaded from filter_y.  The 32-bit sum is
 * reduced with vqrshrun_n_s32(sum, 7) (rounding, saturating shift right by 7),
 * narrowed to 8 bits with saturation, and then combined with the pixel
 * already stored in dst using a rounding halving add:
 *     dst = (filtered + dst + 1) >> 1
 *
 * Only unscaled filtering is handled: y_step_q4 must be 16 (asserted).
 * filter_x and x_step_q4 are ignored.  The region is processed in 4x4 tiles
 * and both loops step by 4, so w and h are expected to be multiples of 4.
 *
 * Local names mirror the NEON d/q registers used by the assembly version of
 * this routine so that the two can be compared side by side.
 */
void aom_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x,  // unused
                                 int x_step_q4,            // unused
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h) {
  int height;
  const uint8_t *s;
  uint8_t *d;
  uint8x8_t d2u8, d3u8;
  uint32x2_t d2u32, d3u32;
  // These vectors are filled one 32-bit lane at a time.  vld1_lane_u32()
  // takes the previous vector value as an operand, so every one of them must
  // hold a defined value before its first lane load.
  uint32x2_t d6u32 = vdup_n_u32(0), d7u32 = vdup_n_u32(0);
  uint32x2_t d16u32 = vdup_n_u32(0), d18u32 = vdup_n_u32(0);
  uint32x2_t d20u32 = vdup_n_u32(0), d22u32 = vdup_n_u32(0);
  uint32x2_t d24u32 = vdup_n_u32(0), d26u32 = vdup_n_u32(0);
  uint8x16_t q1u8, q3u8;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;

  assert(y_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_x;

  src -= src_stride * 3;  // adjust for taps: start three rows above
  q0s16 = vld1q_s16(filter_y);
  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
    // Prime the window with the first seven rows of this 4-pixel column.
    s = src;
    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
    s += src_stride;
    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
    s += src_stride;
    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
    s += src_stride;
    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
    s += src_stride;
    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
    s += src_stride;
    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
    s += src_stride;
    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
    s += src_stride;

    // Widen to 16 bits: q8 = rows 0|1, q9 = rows 2|3, q10 = rows 4|5,
    // low half of q11 = row 6.
    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));

    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
    d = dst;
    for (height = h; height > 0; height -= 4) {  // loop_vert
      // Four new rows: d24 = rows 7|10, d26 = rows 8|9 (lane order follows
      // the assembly version).
      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
      s += src_stride;
      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
      s += src_stride;
      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
      s += src_stride;
      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
      s += src_stride;

      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));

      // Current contents of the four destination rows, for the average.
      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
      d += dst_stride;
      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
      d += dst_stride;
      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
      d += dst_stride;
      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
      d -= dst_stride * 3;  // rewind to the first row for the stores below

      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      // One filtered output row per call; each call's window is shifted
      // down by one source row.
      __builtin_prefetch(s);
      __builtin_prefetch(s + src_stride);
      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
                             d22s16, d24s16, q0s16);
      __builtin_prefetch(s + src_stride * 2);
      __builtin_prefetch(s + src_stride * 3);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
                             d24s16, d26s16, q0s16);
      __builtin_prefetch(d);
      __builtin_prefetch(d + dst_stride);
      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
                              d26s16, d27s16, q0s16);
      __builtin_prefetch(d + dst_stride * 2);
      __builtin_prefetch(d + dst_stride * 3);
      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
                              d27s16, d25s16, q0s16);

      // Rounding, saturating shift right by 7, narrowing 32 -> 16 bits.
      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      // Saturating narrow 16 -> 8 bits.
      d2u8 = vqmovn_u16(q1u16);
      d3u8 = vqmovn_u16(q2u16);

      q1u8 = vcombine_u8(d2u8, d3u8);
      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));

      // Average the new value and the dst value.
      q1u8 = vrhaddq_u8(q1u8, q3u8);

      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));

      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);
      d += dst_stride;

      // Slide the window down four rows: the last seven rows seen become
      // the first seven rows of the next tile.
      q8u16 = q10u16;
      d18s16 = d22s16;
      d19s16 = d24s16;
      q10u16 = q13u16;
      d22s16 = d25s16;
    }
  }
  return;
}
All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + + ; These functions are only valid when: + ; x_step_q4 == 16 + ; w%4 == 0 + ; h%4 == 0 + ; taps == 8 + ; AV1_FILTER_WEIGHT == 128 + ; AV1_FILTER_SHIFT == 7 + + EXPORT |aom_convolve8_avg_horiz_neon| + EXPORT |aom_convolve8_avg_vert_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Multiply and accumulate by q0 + MACRO + MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7 + vmull.s16 $dst, $src0, d0[0] + vmlal.s16 $dst, $src1, d0[1] + vmlal.s16 $dst, $src2, d0[2] + vmlal.s16 $dst, $src3, d0[3] + vmlal.s16 $dst, $src4, d1[0] + vmlal.s16 $dst, $src5, d1[1] + vmlal.s16 $dst, $src6, d1[2] + vmlal.s16 $dst, $src7, d1[3] + MEND + +; r0 const uint8_t *src +; r1 int src_stride +; r2 uint8_t *dst +; r3 int dst_stride +; sp[]const int16_t *filter_x +; sp[]int x_step_q4 +; sp[]const int16_t *filter_y ; unused +; sp[]int y_step_q4 ; unused +; sp[]int w +; sp[]int h + +|aom_convolve8_avg_horiz_neon| PROC + push {r4-r10, lr} + + sub r0, r0, #3 ; adjust for taps + + ldr r5, [sp, #32] ; filter_x + ldr r6, [sp, #48] ; w + ldr r7, [sp, #52] ; h + + vld1.s16 {q0}, [r5] ; filter_x + + sub r8, r1, r1, lsl #2 ; -src_stride * 3 + add r8, r8, #4 ; -src_stride * 3 + 4 + + sub r4, r3, r3, lsl #2 ; -dst_stride * 3 + add r4, r4, #4 ; -dst_stride * 3 + 4 + + rsb r9, r6, r1, lsl #2 ; reset src for outer loop + sub r9, r9, #7 + rsb r12, r6, r3, lsl #2 ; reset dst for outer loop + + mov r10, r6 ; w loop counter + +aom_convolve8_avg_loop_horiz_v + vld1.8 {d24}, 
[r0], r1 + vld1.8 {d25}, [r0], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d27}, [r0], r8 + + vtrn.16 q12, q13 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + + pld [r0, r1, lsl #2] + + vmovl.u8 q8, d24 + vmovl.u8 q9, d25 + vmovl.u8 q10, d26 + vmovl.u8 q11, d27 + + ; save a few instructions in the inner loop + vswp d17, d18 + vmov d23, d21 + + add r0, r0, #3 + +aom_convolve8_avg_loop_horiz + add r5, r0, #64 + + vld1.32 {d28[]}, [r0], r1 + vld1.32 {d29[]}, [r0], r1 + vld1.32 {d31[]}, [r0], r1 + vld1.32 {d30[]}, [r0], r8 + + pld [r5] + + vtrn.16 d28, d31 + vtrn.16 d29, d30 + vtrn.8 d28, d29 + vtrn.8 d31, d30 + + pld [r5, r1] + + ; extract to s16 + vtrn.32 q14, q15 + vmovl.u8 q12, d28 + vmovl.u8 q13, d29 + + pld [r5, r1, lsl #1] + + ; slightly out of order load to match the existing data + vld1.u32 {d6[0]}, [r2], r3 + vld1.u32 {d7[0]}, [r2], r3 + vld1.u32 {d6[1]}, [r2], r3 + vld1.u32 {d7[1]}, [r2], r3 + + sub r2, r2, r3, lsl #2 ; reset for store + + ; src[] * filter_x + MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 + MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 + MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 + MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25 + + pld [r5, -r8] + + ; += 64 >> 7 + vqrshrun.s32 d2, q1, #7 + vqrshrun.s32 d3, q2, #7 + vqrshrun.s32 d4, q14, #7 + vqrshrun.s32 d5, q15, #7 + + ; saturate + vqmovn.u16 d2, q1 + vqmovn.u16 d3, q2 + + ; transpose + vtrn.16 d2, d3 + vtrn.32 d2, d3 + vtrn.8 d2, d3 + + ; average the new value and the dst value + vrhadd.u8 q1, q1, q3 + + vst1.u32 {d2[0]}, [r2@32], r3 + vst1.u32 {d3[0]}, [r2@32], r3 + vst1.u32 {d2[1]}, [r2@32], r3 + vst1.u32 {d3[1]}, [r2@32], r4 + + vmov q8, q9 + vmov d20, d23 + vmov q11, q12 + vmov q9, q13 + + subs r6, r6, #4 ; w -= 4 + bgt aom_convolve8_avg_loop_horiz + + ; outer loop + mov r6, r10 ; restore w counter + add r0, r0, r9 ; src += src_stride * 4 - w + add r2, r2, r12 ; dst += dst_stride * 4 - w + subs r7, r7, #4 ; h -= 4 + bgt aom_convolve8_avg_loop_horiz_v 
+ + pop {r4-r10, pc} + + ENDP + +|aom_convolve8_avg_vert_neon| PROC + push {r4-r8, lr} + + ; adjust for taps + sub r0, r0, r1 + sub r0, r0, r1, lsl #1 + + ldr r4, [sp, #32] ; filter_y + ldr r6, [sp, #40] ; w + ldr lr, [sp, #44] ; h + + vld1.s16 {q0}, [r4] ; filter_y + + lsl r1, r1, #1 + lsl r3, r3, #1 + +aom_convolve8_avg_loop_vert_h + mov r4, r0 + add r7, r0, r1, asr #1 + mov r5, r2 + add r8, r2, r3, asr #1 + mov r12, lr ; h loop counter + + vld1.u32 {d16[0]}, [r4], r1 + vld1.u32 {d16[1]}, [r7], r1 + vld1.u32 {d18[0]}, [r4], r1 + vld1.u32 {d18[1]}, [r7], r1 + vld1.u32 {d20[0]}, [r4], r1 + vld1.u32 {d20[1]}, [r7], r1 + vld1.u32 {d22[0]}, [r4], r1 + + vmovl.u8 q8, d16 + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 + +aom_convolve8_avg_loop_vert + ; always process a 4x4 block at a time + vld1.u32 {d24[0]}, [r7], r1 + vld1.u32 {d26[0]}, [r4], r1 + vld1.u32 {d26[1]}, [r7], r1 + vld1.u32 {d24[1]}, [r4], r1 + + ; extract to s16 + vmovl.u8 q12, d24 + vmovl.u8 q13, d26 + + vld1.u32 {d6[0]}, [r5@32], r3 + vld1.u32 {d6[1]}, [r8@32], r3 + vld1.u32 {d7[0]}, [r5@32], r3 + vld1.u32 {d7[1]}, [r8@32], r3 + + pld [r7] + pld [r4] + + ; src[] * filter_y + MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 + + pld [r7, r1] + pld [r4, r1] + + MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26 + + pld [r5] + pld [r8] + + MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27 + + pld [r5, r3] + pld [r8, r3] + + MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25 + + ; += 64 >> 7 + vqrshrun.s32 d2, q1, #7 + vqrshrun.s32 d3, q2, #7 + vqrshrun.s32 d4, q14, #7 + vqrshrun.s32 d5, q15, #7 + + ; saturate + vqmovn.u16 d2, q1 + vqmovn.u16 d3, q2 + + ; average the new value and the dst value + vrhadd.u8 q1, q1, q3 + + sub r5, r5, r3, lsl #1 ; reset for store + sub r8, r8, r3, lsl #1 + + vst1.u32 {d2[0]}, [r5@32], r3 + vst1.u32 {d2[1]}, [r8@32], r3 + vst1.u32 {d3[0]}, [r5@32], r3 + vst1.u32 {d3[1]}, [r8@32], r3 + + vmov q8, q10 + vmov d18, d22 + vmov d19, d24 + 
vmov q10, q13 + vmov d22, d25 + + subs r12, r12, #4 ; h -= 4 + bgt aom_convolve8_avg_loop_vert + + ; outer loop + add r0, r0, #4 + add r2, r2, #4 + subs r6, r6, #4 ; w -= 4 + bgt aom_convolve8_avg_loop_vert_h + + pop {r4-r8, pc} + + ENDP + END diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c new file mode 100644 index 000000000..8ebffb5f9 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
/*
 * 8-tap multiply-accumulate.
 *
 * Returns, per 32-bit lane,
 *     dsrc0 * q0s16[0] + dsrc1 * q0s16[1] + ... + dsrc7 * q0s16[7]
 * with each 16x16-bit product widened to 32 bits before accumulation.
 */
static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
                                       int16x4_t dsrc2, int16x4_t dsrc3,
                                       int16x4_t dsrc4, int16x4_t dsrc5,
                                       int16x4_t dsrc6, int16x4_t dsrc7,
                                       int16x8_t q0s16) {
  int32x4_t qdst;
  int16x4_t d0s16, d1s16;

  d0s16 = vget_low_s16(q0s16);   // taps 0..3
  d1s16 = vget_high_s16(q0s16);  // taps 4..7

  qdst = vmull_lane_s16(dsrc0, d0s16, 0);
  qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
  qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
  qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
  qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
  qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
  qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
  qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
  return qdst;
}

/*
 * 8-tap horizontal convolution of a w x h region.
 *
 * For every output pixel the eight source pixels starting three columns to
 * its left are multiplied by the eight taps loaded from filter_x.  The sum is
 * reduced with vqrshrun_n_s32(sum, 7) (rounding, saturating shift right by 7)
 * and narrowed to 8 bits with saturation before being stored to dst.
 *
 * Only unscaled filtering is handled: x_step_q4 must be 16 (asserted).
 * filter_y and y_step_q4 are ignored.  The region is processed in 4x4 tiles
 * and both loops step by 4, so w and h are expected to be multiples of 4.
 *
 * Pixels are transposed on load so that each int16x4_t holds one source
 * column for four rows, and transposed back before the store.  Local names
 * mirror the NEON d/q registers used by the assembly version of this routine.
 */
void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y,  // unused
                              int y_step_q4,            // unused
                              int w, int h) {
  int width;
  const uint8_t *s, *psrc;
  uint8_t *d, *pdst;
  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
  uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
  uint8x16_t q12u8, q13u8, q14u8, q15u8;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;
  uint16x8x2_t q0x2u16;
  uint8x8x2_t d0x2u8, d1x2u8;
  uint32x2x2_t d0x2u32;
  uint16x4x2_t d0x2u16, d1x2u16;
  uint32x4x2_t q0x2u32;

  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_y;

  q0s16 = vld1q_s16(filter_x);

  src -= 3;  // adjust for taps
  for (; h > 0; h -= 4, src += src_stride * 4,
                dst += dst_stride * 4) {  // loop_horiz_v
    // Load the first 8 pixels of four rows and transpose them into columns.
    s = src;
    d24u8 = vld1_u8(s);
    s += src_stride;
    d25u8 = vld1_u8(s);
    s += src_stride;
    d26u8 = vld1_u8(s);
    s += src_stride;
    d27u8 = vld1_u8(s);

    q12u8 = vcombine_u8(d24u8, d25u8);
    q13u8 = vcombine_u8(d26u8, d27u8);

    q0x2u16 =
        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
    d0x2u8 = vtrn_u8(d24u8, d25u8);
    d1x2u8 = vtrn_u8(d26u8, d27u8);

    __builtin_prefetch(src + src_stride * 4);
    __builtin_prefetch(src + src_stride * 5);
    __builtin_prefetch(src + src_stride * 6);

    q8u16 = vmovl_u8(d0x2u8.val[0]);
    q9u16 = vmovl_u8(d0x2u8.val[1]);
    q10u16 = vmovl_u8(d1x2u8.val[0]);
    q11u16 = vmovl_u8(d1x2u8.val[1]);

    // Rearranged once here to save a few instructions in the inner loop.
    d16u16 = vget_low_u16(q8u16);
    d17u16 = vget_high_u16(q8u16);
    d18u16 = vget_low_u16(q9u16);
    d19u16 = vget_high_u16(q9u16);
    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
    q9u16 = vcombine_u16(d17u16, d19u16);

    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
    for (width = w, psrc = src + 7, pdst = dst; width > 0;
         width -= 4, psrc += 4, pdst += 4) {  // loop_horiz
      // Four more pixels from each of the four rows.
      s = psrc;
      d28u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d29u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d31u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d30u32 = vld1_dup_u32((const uint32_t *)s);

      __builtin_prefetch(psrc + 64);

      d0x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
      d1x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30

      __builtin_prefetch(psrc + 64 + src_stride);

      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
      q0x2u32 =
          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));

      // Extract to s16.
      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
      q12u16 = vmovl_u8(d28u8);
      q13u16 = vmovl_u8(d29u8);

      __builtin_prefetch(psrc + 64 + src_stride * 2);

      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      // src[] * filter_x: one output column per call, window shifted by one
      // source column each time.
      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
                             d23s16, d24s16, q0s16);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
                             d24s16, d26s16, q0s16);
      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
                              d26s16, d27s16, q0s16);
      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
                              d27s16, d25s16, q0s16);

      // 60 rather than 64: mirrors `pld [r5, -r8]` in the assembly version,
      // where -r8 is src_stride * 3 - 4.
      __builtin_prefetch(psrc + 60 + src_stride * 3);

      // Rounding, saturating shift right by 7, narrowing 32 -> 16 bits.
      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      // Saturating narrow 16 -> 8 bits.
      d2u8 = vqmovn_u16(q1u16);
      d3u8 = vqmovn_u16(q2u16);

      // Transpose back from columns to rows.
      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
                         vreinterpret_u32_u16(d0x2u16.val[1]));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
                       vreinterpret_u8_u32(d0x2u32.val[1]));

      d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
      d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);

      d = pdst;
      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);

      // Slide the window right by four columns.
      q8u16 = q9u16;
      d20s16 = d23s16;
      q11u16 = q12u16;
      q9u16 = q13u16;
      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    }
  }
  return;
}

/*
 * 8-tap vertical convolution of a w x h region.
 *
 * For every output pixel the eight source rows starting three rows above it
 * are multiplied by the eight taps loaded from filter_y.  The sum is reduced
 * with vqrshrun_n_s32(sum, 7) (rounding, saturating shift right by 7) and
 * narrowed to 8 bits with saturation before being stored to dst.
 *
 * Only unscaled filtering is handled: y_step_q4 must be 16 (asserted).
 * filter_x and x_step_q4 are ignored.  The region is processed in 4x4 tiles
 * and both loops step by 4, so w and h are expected to be multiples of 4.
 *
 * Local names mirror the NEON d/q registers used by the assembly version of
 * this routine.
 */
void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x,  // unused
                             int x_step_q4,            // unused
                             const int16_t *filter_y, int y_step_q4, int w,
                             int h) {
  int height;
  const uint8_t *s;
  uint8_t *d;
  uint32x2_t d2u32, d3u32;
  // These vectors are filled one 32-bit lane at a time.  vld1_lane_u32()
  // takes the previous vector value as an operand, so every one of them must
  // hold a defined value before its first lane load.
  uint32x2_t d16u32 = vdup_n_u32(0), d18u32 = vdup_n_u32(0);
  uint32x2_t d20u32 = vdup_n_u32(0), d22u32 = vdup_n_u32(0);
  uint32x2_t d24u32 = vdup_n_u32(0), d26u32 = vdup_n_u32(0);
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;

  assert(y_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_x;

  src -= src_stride * 3;  // adjust for taps: start three rows above
  q0s16 = vld1q_s16(filter_y);
  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
    // Prime the window with the first seven rows of this 4-pixel column.
    s = src;
    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
    s += src_stride;
    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
    s += src_stride;
    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
    s += src_stride;
    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
    s += src_stride;
    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
    s += src_stride;
    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
    s += src_stride;
    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
    s += src_stride;

    // Widen to 16 bits: q8 = rows 0|1, q9 = rows 2|3, q10 = rows 4|5,
    // low half of q11 = row 6.
    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));

    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
    d = dst;
    for (height = h; height > 0; height -= 4) {  // loop_vert
      // Four new rows: d24 = rows 7|10, d26 = rows 8|9 (lane order follows
      // the assembly version).
      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
      s += src_stride;
      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
      s += src_stride;
      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
      s += src_stride;
      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
      s += src_stride;

      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));

      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      // src[] * filter_y: one output row per call, window shifted down by
      // one source row each time.
      __builtin_prefetch(d);
      __builtin_prefetch(d + dst_stride);
      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
                             d22s16, d24s16, q0s16);
      __builtin_prefetch(d + dst_stride * 2);
      __builtin_prefetch(d + dst_stride * 3);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
                             d24s16, d26s16, q0s16);
      __builtin_prefetch(s);
      __builtin_prefetch(s + src_stride);
      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
                              d26s16, d27s16, q0s16);
      __builtin_prefetch(s + src_stride * 2);
      __builtin_prefetch(s + src_stride * 3);
      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
                              d27s16, d25s16, q0s16);

      // Rounding, saturating shift right by 7, narrowing 32 -> 16 bits.
      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      // Saturating narrow 16 -> 8 bits.
      d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
      d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));

      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);
      d += dst_stride;

      // Slide the window down four rows: the last seven rows seen become
      // the first seven rows of the next tile.
      q8u16 = q10u16;
      d18s16 = d22s16;
      d19s16 = d24s16;
      q10u16 = q13u16;
      d22s16 = d25s16;
    }
  }
  return;
}
+; + +; + + + ; These functions are only valid when: + ; x_step_q4 == 16 + ; w%4 == 0 + ; h%4 == 0 + ; taps == 8 + ; AV1_FILTER_WEIGHT == 128 + ; AV1_FILTER_SHIFT == 7 + + EXPORT |aom_convolve8_horiz_neon| + EXPORT |aom_convolve8_vert_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Multiply and accumulate by q0 + MACRO + MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7 + vmull.s16 $dst, $src0, d0[0] + vmlal.s16 $dst, $src1, d0[1] + vmlal.s16 $dst, $src2, d0[2] + vmlal.s16 $dst, $src3, d0[3] + vmlal.s16 $dst, $src4, d1[0] + vmlal.s16 $dst, $src5, d1[1] + vmlal.s16 $dst, $src6, d1[2] + vmlal.s16 $dst, $src7, d1[3] + MEND + +; r0 const uint8_t *src +; r1 int src_stride +; r2 uint8_t *dst +; r3 int dst_stride +; sp[]const int16_t *filter_x +; sp[]int x_step_q4 +; sp[]const int16_t *filter_y ; unused +; sp[]int y_step_q4 ; unused +; sp[]int w +; sp[]int h + +|aom_convolve8_horiz_neon| PROC + push {r4-r10, lr} + + sub r0, r0, #3 ; adjust for taps + + ldr r5, [sp, #32] ; filter_x + ldr r6, [sp, #48] ; w + ldr r7, [sp, #52] ; h + + vld1.s16 {q0}, [r5] ; filter_x + + sub r8, r1, r1, lsl #2 ; -src_stride * 3 + add r8, r8, #4 ; -src_stride * 3 + 4 + + sub r4, r3, r3, lsl #2 ; -dst_stride * 3 + add r4, r4, #4 ; -dst_stride * 3 + 4 + + rsb r9, r6, r1, lsl #2 ; reset src for outer loop + sub r9, r9, #7 + rsb r12, r6, r3, lsl #2 ; reset dst for outer loop + + mov r10, r6 ; w loop counter + +aom_convolve8_loop_horiz_v + vld1.8 {d24}, [r0], r1 + vld1.8 {d25}, [r0], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d27}, [r0], r8 + + vtrn.16 q12, q13 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + + pld [r0, r1, lsl #2] + + vmovl.u8 q8, d24 + vmovl.u8 q9, d25 + vmovl.u8 q10, d26 + vmovl.u8 q11, d27 + + ; save a few instructions in the inner loop + vswp d17, d18 + vmov d23, d21 + + add r0, r0, #3 + +aom_convolve8_loop_horiz + add r5, r0, #64 + + vld1.32 {d28[]}, [r0], r1 + vld1.32 {d29[]}, [r0], r1 + vld1.32 {d31[]}, [r0], r1 + vld1.32 
{d30[]}, [r0], r8 + + pld [r5] + + vtrn.16 d28, d31 + vtrn.16 d29, d30 + vtrn.8 d28, d29 + vtrn.8 d31, d30 + + pld [r5, r1] + + ; extract to s16 + vtrn.32 q14, q15 + vmovl.u8 q12, d28 + vmovl.u8 q13, d29 + + pld [r5, r1, lsl #1] + + ; src[] * filter_x + MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 + MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 + MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 + MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25 + + pld [r5, -r8] + + ; += 64 >> 7 + vqrshrun.s32 d2, q1, #7 + vqrshrun.s32 d3, q2, #7 + vqrshrun.s32 d4, q14, #7 + vqrshrun.s32 d5, q15, #7 + + ; saturate + vqmovn.u16 d2, q1 + vqmovn.u16 d3, q2 + + ; transpose + vtrn.16 d2, d3 + vtrn.32 d2, d3 + vtrn.8 d2, d3 + + vst1.u32 {d2[0]}, [r2@32], r3 + vst1.u32 {d3[0]}, [r2@32], r3 + vst1.u32 {d2[1]}, [r2@32], r3 + vst1.u32 {d3[1]}, [r2@32], r4 + + vmov q8, q9 + vmov d20, d23 + vmov q11, q12 + vmov q9, q13 + + subs r6, r6, #4 ; w -= 4 + bgt aom_convolve8_loop_horiz + + ; outer loop + mov r6, r10 ; restore w counter + add r0, r0, r9 ; src += src_stride * 4 - w + add r2, r2, r12 ; dst += dst_stride * 4 - w + subs r7, r7, #4 ; h -= 4 + bgt aom_convolve8_loop_horiz_v + + pop {r4-r10, pc} + + ENDP + +|aom_convolve8_vert_neon| PROC + push {r4-r8, lr} + + ; adjust for taps + sub r0, r0, r1 + sub r0, r0, r1, lsl #1 + + ldr r4, [sp, #32] ; filter_y + ldr r6, [sp, #40] ; w + ldr lr, [sp, #44] ; h + + vld1.s16 {q0}, [r4] ; filter_y + + lsl r1, r1, #1 + lsl r3, r3, #1 + +aom_convolve8_loop_vert_h + mov r4, r0 + add r7, r0, r1, asr #1 + mov r5, r2 + add r8, r2, r3, asr #1 + mov r12, lr ; h loop counter + + vld1.u32 {d16[0]}, [r4], r1 + vld1.u32 {d16[1]}, [r7], r1 + vld1.u32 {d18[0]}, [r4], r1 + vld1.u32 {d18[1]}, [r7], r1 + vld1.u32 {d20[0]}, [r4], r1 + vld1.u32 {d20[1]}, [r7], r1 + vld1.u32 {d22[0]}, [r4], r1 + + vmovl.u8 q8, d16 + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 + +aom_convolve8_loop_vert + ; always process a 4x4 block at 
a time + vld1.u32 {d24[0]}, [r7], r1 + vld1.u32 {d26[0]}, [r4], r1 + vld1.u32 {d26[1]}, [r7], r1 + vld1.u32 {d24[1]}, [r4], r1 + + ; extract to s16 + vmovl.u8 q12, d24 + vmovl.u8 q13, d26 + + pld [r5] + pld [r8] + + ; src[] * filter_y + MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 + + pld [r5, r3] + pld [r8, r3] + + MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26 + + pld [r7] + pld [r4] + + MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27 + + pld [r7, r1] + pld [r4, r1] + + MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25 + + ; += 64 >> 7 + vqrshrun.s32 d2, q1, #7 + vqrshrun.s32 d3, q2, #7 + vqrshrun.s32 d4, q14, #7 + vqrshrun.s32 d5, q15, #7 + + ; saturate + vqmovn.u16 d2, q1 + vqmovn.u16 d3, q2 + + vst1.u32 {d2[0]}, [r5@32], r3 + vst1.u32 {d2[1]}, [r8@32], r3 + vst1.u32 {d3[0]}, [r5@32], r3 + vst1.u32 {d3[1]}, [r8@32], r3 + + vmov q8, q10 + vmov d18, d22 + vmov d19, d24 + vmov q10, q13 + vmov d22, d25 + + subs r12, r12, #4 ; h -= 4 + bgt aom_convolve8_loop_vert + + ; outer loop + add r0, r0, #4 + add r2, r2, #4 + subs r6, r6, #4 ; w -= 4 + bgt aom_convolve8_loop_vert_h + + pop {r4-r8, pc} + + ENDP + END diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c new file mode 100644 index 000000000..f05d3ceae --- /dev/null +++ b/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
/*
 * Averages a w x h block of `src` into `dst` in place.  Each destination
 * byte becomes the rounding halving add (vrhadd) of itself and the
 * co-located source byte.
 *
 * filter_x, filter_x_stride, filter_y and filter_y_stride are ignored; they
 * are present only so the function matches the common convolve prototype.
 *
 * Width selects the code path:
 *   w > 32      64 bytes per row, one row per iteration
 *   w == 32     32 bytes per row, two rows per iteration
 *   8 < w < 32  16 bytes per row, two rows per iteration
 *   w == 8       8 bytes per row, two rows per iteration
 *   otherwise    4 bytes per row, two rows per iteration
 * The two-row paths always process rows in pairs, so an odd `h` touches one
 * extra row.
 */
void aom_convolve_avg_neon(const uint8_t *src,    // r0
                           ptrdiff_t src_stride,  // r1
                           uint8_t *dst,          // r2
                           ptrdiff_t dst_stride,  // r3
                           const int16_t *filter_x, int filter_x_stride,
                           const int16_t *filter_y, int filter_y_stride, int w,
                           int h) {
  // Read cursor over dst.  Every iteration loads all destination rows it
  // needs through `d` before the averaged rows are stored through `dst`.
  const uint8_t *d = dst;
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  if (w > 32) {  // avg64
    for (; h > 0; h -= 1) {
      const uint8x16_t s0 = vld1q_u8(src);
      const uint8x16_t s1 = vld1q_u8(src + 16);
      const uint8x16_t s2 = vld1q_u8(src + 32);
      const uint8x16_t s3 = vld1q_u8(src + 48);
      const uint8x16_t d0 = vld1q_u8(d);
      const uint8x16_t d1 = vld1q_u8(d + 16);
      const uint8x16_t d2 = vld1q_u8(d + 32);
      const uint8x16_t d3 = vld1q_u8(d + 48);
      src += src_stride;
      d += dst_stride;

      vst1q_u8(dst, vrhaddq_u8(s0, d0));
      vst1q_u8(dst + 16, vrhaddq_u8(s1, d1));
      vst1q_u8(dst + 32, vrhaddq_u8(s2, d2));
      vst1q_u8(dst + 48, vrhaddq_u8(s3, d3));
      dst += dst_stride;
    }
  } else if (w == 32) {  // avg32
    for (; h > 0; h -= 2) {
      const uint8x16_t s0 = vld1q_u8(src);
      const uint8x16_t s1 = vld1q_u8(src + 16);
      const uint8x16_t s2 = vld1q_u8(src + src_stride);
      const uint8x16_t s3 = vld1q_u8(src + src_stride + 16);
      const uint8x16_t d0 = vld1q_u8(d);
      const uint8x16_t d1 = vld1q_u8(d + 16);
      const uint8x16_t d2 = vld1q_u8(d + dst_stride);
      const uint8x16_t d3 = vld1q_u8(d + dst_stride + 16);
      src += src_stride;
      src += src_stride;
      d += dst_stride;
      d += dst_stride;

      vst1q_u8(dst, vrhaddq_u8(s0, d0));
      vst1q_u8(dst + 16, vrhaddq_u8(s1, d1));
      dst += dst_stride;
      vst1q_u8(dst, vrhaddq_u8(s2, d2));
      vst1q_u8(dst + 16, vrhaddq_u8(s3, d3));
      dst += dst_stride;
    }
  } else if (w > 8) {  // avg16
    for (; h > 0; h -= 2) {
      const uint8x16_t s0 = vld1q_u8(src);
      const uint8x16_t s1 = vld1q_u8(src + src_stride);
      const uint8x16_t d0 = vld1q_u8(d);
      const uint8x16_t d1 = vld1q_u8(d + dst_stride);
      src += src_stride;
      src += src_stride;
      d += dst_stride;
      d += dst_stride;

      vst1q_u8(dst, vrhaddq_u8(s0, d0));
      dst += dst_stride;
      vst1q_u8(dst, vrhaddq_u8(s1, d1));
      dst += dst_stride;
    }
  } else if (w == 8) {  // avg8
    for (; h > 0; h -= 2) {
      // Two 8-byte rows are packed into one Q register so a single vrhaddq
      // averages both rows.
      const uint8x16_t s01 =
          vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride));
      const uint8x16_t d01 = vcombine_u8(vld1_u8(d), vld1_u8(d + dst_stride));
      const uint8x16_t avg = vrhaddq_u8(s01, d01);
      src += src_stride;
      src += src_stride;
      d += dst_stride;
      d += dst_stride;

      vst1_u8(dst, vget_low_u8(avg));
      dst += dst_stride;
      vst1_u8(dst, vget_high_u8(avg));
      dst += dst_stride;
    }
  } else {  // avg4
    // vld1_lane_u32() replaces one lane and passes the other through, so the
    // vectors need a defined starting value.  They were previously read
    // while still uninitialised.
    uint32x2_t s32 = vdup_n_u32(0);
    uint32x2_t d32 = vdup_n_u32(0);

    // NOTE(review): the uint32_t pointer casts below presume 4-byte-aligned
    // rows; confirm against the callers.
    for (; h > 0; h -= 2) {
      uint32x2_t avg32;

      s32 = vld1_lane_u32((const uint32_t *)src, s32, 0);
      src += src_stride;
      s32 = vld1_lane_u32((const uint32_t *)src, s32, 1);
      src += src_stride;
      d32 = vld1_lane_u32((const uint32_t *)d, d32, 0);
      d += dst_stride;
      d32 = vld1_lane_u32((const uint32_t *)d, d32, 1);
      d += dst_stride;

      avg32 = vreinterpret_u32_u8(
          vrhadd_u8(vreinterpret_u8_u32(s32), vreinterpret_u8_u32(d32)));

      vst1_lane_u32((uint32_t *)dst, avg32, 0);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, avg32, 1);
      dst += dst_stride;
    }
  }
}
If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_convolve_avg_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|aom_convolve_avg_neon| PROC + push {r4-r6, lr} + ldrd r4, r5, [sp, #32] + mov r6, r2 + + cmp r4, #32 + bgt avg64 + beq avg32 + cmp r4, #8 + bgt avg16 + beq avg8 + b avg4 + +avg64 + sub lr, r1, #32 + sub r4, r3, #32 +avg64_h + pld [r0, r1, lsl #1] + vld1.8 {q0-q1}, [r0]! + vld1.8 {q2-q3}, [r0], lr + pld [r2, r3] + vld1.8 {q8-q9}, [r6@128]! + vld1.8 {q10-q11}, [r6@128], r4 + vrhadd.u8 q0, q0, q8 + vrhadd.u8 q1, q1, q9 + vrhadd.u8 q2, q2, q10 + vrhadd.u8 q3, q3, q11 + vst1.8 {q0-q1}, [r2@128]! + vst1.8 {q2-q3}, [r2@128], r4 + subs r5, r5, #1 + bgt avg64_h + pop {r4-r6, pc} + +avg32 + vld1.8 {q0-q1}, [r0], r1 + vld1.8 {q2-q3}, [r0], r1 + vld1.8 {q8-q9}, [r6@128], r3 + vld1.8 {q10-q11}, [r6@128], r3 + pld [r0] + vrhadd.u8 q0, q0, q8 + pld [r0, r1] + vrhadd.u8 q1, q1, q9 + pld [r6] + vrhadd.u8 q2, q2, q10 + pld [r6, r3] + vrhadd.u8 q3, q3, q11 + vst1.8 {q0-q1}, [r2@128], r3 + vst1.8 {q2-q3}, [r2@128], r3 + subs r5, r5, #2 + bgt avg32 + pop {r4-r6, pc} + +avg16 + vld1.8 {q0}, [r0], r1 + vld1.8 {q1}, [r0], r1 + vld1.8 {q2}, [r6@128], r3 + vld1.8 {q3}, [r6@128], r3 + pld [r0] + pld [r0, r1] + vrhadd.u8 q0, q0, q2 + pld [r6] + pld [r6, r3] + vrhadd.u8 q1, q1, q3 + vst1.8 {q0}, [r2@128], r3 + vst1.8 {q1}, [r2@128], r3 + subs r5, r5, #2 + bgt avg16 + pop {r4-r6, pc} + +avg8 + vld1.8 {d0}, [r0], r1 + vld1.8 {d1}, [r0], r1 + vld1.8 {d2}, [r6@64], r3 + vld1.8 {d3}, [r6@64], r3 + pld [r0] + pld [r0, r1] + vrhadd.u8 q0, q0, q1 + pld [r6] + pld [r6, r3] + vst1.8 {d0}, [r2@64], r3 + vst1.8 {d1}, [r2@64], r3 + subs r5, r5, #2 + bgt avg8 + pop {r4-r6, pc} + +avg4 + vld1.32 {d0[0]}, [r0], r1 + vld1.32 {d0[1]}, [r0], r1 + vld1.32 {d2[0]}, [r6@32], r3 + vld1.32 {d2[1]}, [r6@32], r3 + vrhadd.u8 d0, d0, 
/*
 * Copies a w x h block of bytes from `src` to `dst`.
 *
 * filter_x, filter_x_stride, filter_y and filter_y_stride are ignored; they
 * are present only so the function matches the common convolve prototype.
 *
 * Width selects the code path:
 *   w > 32      64 bytes per row, one row per iteration
 *   w == 32     32 bytes per row, two rows per iteration
 *   8 < w < 32  16 bytes per row, two rows per iteration
 *   w == 8       8 bytes per row, two rows per iteration
 *   otherwise    4 bytes per row, one row per iteration
 * The two-row paths always process rows in pairs, so an odd `h` touches one
 * extra row.
 */
void aom_convolve_copy_neon(const uint8_t *src,    // r0
                            ptrdiff_t src_stride,  // r1
                            uint8_t *dst,          // r2
                            ptrdiff_t dst_stride,  // r3
                            const int16_t *filter_x, int filter_x_stride,
                            const int16_t *filter_y, int filter_y_stride, int w,
                            int h) {
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  if (w > 32) {  // copy64
    for (; h > 0; h--) {
      const uint8x16_t r0 = vld1q_u8(src);
      const uint8x16_t r1 = vld1q_u8(src + 16);
      const uint8x16_t r2 = vld1q_u8(src + 32);
      const uint8x16_t r3 = vld1q_u8(src + 48);
      src += src_stride;

      vst1q_u8(dst, r0);
      vst1q_u8(dst + 16, r1);
      vst1q_u8(dst + 32, r2);
      vst1q_u8(dst + 48, r3);
      dst += dst_stride;
    }
  } else if (w == 32) {  // copy32
    for (; h > 0; h -= 2) {
      const uint8x16_t a0 = vld1q_u8(src);
      const uint8x16_t a1 = vld1q_u8(src + 16);
      const uint8x16_t b0 = vld1q_u8(src + src_stride);
      const uint8x16_t b1 = vld1q_u8(src + src_stride + 16);
      src += src_stride;
      src += src_stride;

      vst1q_u8(dst, a0);
      vst1q_u8(dst + 16, a1);
      dst += dst_stride;
      vst1q_u8(dst, b0);
      vst1q_u8(dst + 16, b1);
      dst += dst_stride;
    }
  } else if (w > 8) {  // copy16
    for (; h > 0; h -= 2) {
      const uint8x16_t a = vld1q_u8(src);
      const uint8x16_t b = vld1q_u8(src + src_stride);
      src += src_stride;
      src += src_stride;

      vst1q_u8(dst, a);
      dst += dst_stride;
      vst1q_u8(dst, b);
      dst += dst_stride;
    }
  } else if (w == 8) {  // copy8
    for (; h > 0; h -= 2) {
      const uint8x8_t a = vld1_u8(src);
      const uint8x8_t b = vld1_u8(src + src_stride);
      src += src_stride;
      src += src_stride;

      vst1_u8(dst, a);
      dst += dst_stride;
      vst1_u8(dst, b);
      dst += dst_stride;
    }
  } else {  // copy4
    for (; h > 0; h--) {
      // Move the four bytes through uint8_t lvalues.  The earlier
      // *(uint32_t *)dst = *(const uint32_t *)src form accessed byte arrays
      // through an incompatible type and required 4-byte alignment that
      // src/dst are not guaranteed to have.  All four bytes are read before
      // any is written, as the single 32-bit load/store did.
      const uint8_t b0 = src[0];
      const uint8_t b1 = src[1];
      const uint8_t b2 = src[2];
      const uint8_t b3 = src[3];
      dst[0] = b0;
      dst[1] = b1;
      dst[2] = b2;
      dst[3] = b3;
      src += src_stride;
      dst += dst_stride;
    }
  }
}
+ vst1.8 {q2-q3}, [r2@128], r3 + subs r5, r5, #1 + bgt copy64_h + pop {r4-r5, pc} + +copy32 + pld [r0, r1, lsl #1] + vld1.8 {q0-q1}, [r0], r1 + pld [r0, r1, lsl #1] + vld1.8 {q2-q3}, [r0], r1 + vst1.8 {q0-q1}, [r2@128], r3 + vst1.8 {q2-q3}, [r2@128], r3 + subs r5, r5, #2 + bgt copy32 + pop {r4-r5, pc} + +copy16 + pld [r0, r1, lsl #1] + vld1.8 {q0}, [r0], r1 + pld [r0, r1, lsl #1] + vld1.8 {q1}, [r0], r1 + vst1.8 {q0}, [r2@128], r3 + vst1.8 {q1}, [r2@128], r3 + subs r5, r5, #2 + bgt copy16 + pop {r4-r5, pc} + +copy8 + pld [r0, r1, lsl #1] + vld1.8 {d0}, [r0], r1 + pld [r0, r1, lsl #1] + vld1.8 {d2}, [r0], r1 + vst1.8 {d0}, [r2@64], r3 + vst1.8 {d2}, [r2@64], r3 + subs r5, r5, #2 + bgt copy8 + pop {r4-r5, pc} + +copy4 + ldr r12, [r0], r1 + str r12, [r2], r3 + subs r5, r5, #1 + bgt copy4 + pop {r4-r5, pc} + ENDP + + END diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_neon.c new file mode 100644 index 000000000..6c2997e04 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/aom_convolve_neon.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +void aom_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { + /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the + * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). + */ + DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); + + // Account for the vertical phase needing 3 lines prior and 4 lines post + int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + /* Filter starting 3 lines back. The neon implementation will ignore the + * given height and filter a multiple of 4 lines. Since this goes in to + * the temp buffer which has lots of extra room and is subsequently discarded + * this is safe if somewhat less than ideal. + */ + aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x, + x_step_q4, filter_y, y_step_q4, w, + intermediate_height); + + /* Step into the temp buffer 3 lines to get the actual frame data */ + aom_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); +} + +void aom_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); + int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + /* This implementation has the same issues as above. In addition, we only want + * to average the values after both passes. 
+ */ + aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x, + x_step_q4, filter_y, y_step_q4, w, + intermediate_height); + aom_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); +} diff --git a/third_party/aom/aom_dsp/arm/avg_neon.c b/third_party/aom/aom_dsp/arm/avg_neon.c new file mode 100644 index 000000000..e730ccbcc --- /dev/null +++ b/third_party/aom/aom_dsp/arm/avg_neon.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
// Adds the eight 16-bit lanes of v_16x8 together and returns the total.
static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
  // 8 x u16 -> 4 x u32 -> 2 x u64 by pairwise widening adds.
  const uint64x2_t pair_sums = vpaddlq_u32(vpaddlq_u16(v_16x8));
  const uint32x2_t lower = vreinterpret_u32_u64(vget_low_u64(pair_sums));
  const uint32x2_t upper = vreinterpret_u32_u64(vget_high_u64(pair_sums));
  // Lane 0 of each reinterpreted half is added to give the returned total.
  return vget_lane_u32(vadd_u32(lower, upper), 0);
}

// Rounded mean of the 4x4 block of bytes at s, rows p bytes apart.
unsigned int aom_avg_4x4_neon(const uint8_t *s, int p) {
  uint32x2_t rows01 = vdup_n_u32(0);
  uint32x2_t rows23 = vdup_n_u32(0);
  uint16x8_t widened;

  // Each 4-byte row is fetched as one 32-bit lane.
  rows01 = vld1_lane_u32((const uint32_t *)s, rows01, 0);
  rows01 = vld1_lane_u32((const uint32_t *)(s + p), rows01, 1);
  rows23 = vld1_lane_u32((const uint32_t *)(s + 2 * p), rows23, 0);
  rows23 = vld1_lane_u32((const uint32_t *)(s + 3 * p), rows23, 1);

  widened = vaddl_u8(vreinterpret_u8_u32(rows01), vreinterpret_u8_u32(rows23));
  // 16 samples: add half the divisor, then divide by 16.
  return (horizontal_add_u16x8(widened) + 8) >> 4;
}

// Rounded mean of the 8x8 block of bytes at s, rows p bytes apart.
unsigned int aom_avg_8x8_neon(const uint8_t *s, int p) {
  // Rows 0 and 1 seed the 16-bit column sums; rows 2..7 are added on top.
  uint16x8_t column_sums = vaddl_u8(vld1_u8(s), vld1_u8(s + p));
  int row;

  for (row = 2; row < 8; ++row) {
    column_sums = vaddw_u8(column_sums, vld1_u8(s + row * p));
  }

  // 64 samples: add half the divisor, then divide by 64.
  return (horizontal_add_u16x8(column_sums) + 32) >> 6;
}

// coeff: 16 bits, dynamic range [-32640, 32640].
// length: value range {16, 64, 256, 1024}.
// Returns the sum of the absolute values of `length` coefficients.  The loop
// handles 16 coefficients per pass and always runs at least once.
int aom_satd_neon(const int16_t *coeff, int length) {
  const int16x4_t zero = vdup_n_s16(0);
  int32x4_t abs_sum = vdupq_n_s32(0);

  do {
    const int16x8_t first8 = vld1q_s16(coeff);
    const int16x8_t next8 = vld1q_s16(coeff + 8);
    // vabal against zero accumulates |x| widened to 32 bits.
    abs_sum = vabal_s16(abs_sum, vget_low_s16(first8), zero);
    abs_sum = vabal_s16(abs_sum, vget_high_s16(first8), zero);
    abs_sum = vabal_s16(abs_sum, vget_low_s16(next8), zero);
    abs_sum = vabal_s16(abs_sum, vget_high_s16(next8), zero);
    coeff += 16;
    length -= 16;
  } while (length != 0);

  {
    // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
    const int64x2_t pairwise = vpaddlq_s32(abs_sum);
    const int32x2_t folded =
        vadd_s32(vreinterpret_s32_s64(vget_low_s64(pairwise)),
                 vreinterpret_s32_s64(vget_high_s64(pairwise)));
    return vget_lane_s32(folded, 0);
  }
}

// Column sums of a 16-pixel-wide strip: hbuf[x] receives the sum of column x
// over the processed rows, shifted right by (height >> 5) + 3.  Rows are
// consumed in groups of eight.
void aom_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, int ref_stride,
                          int height) {
  uint16x8_t left_sums = vdupq_n_u16(0);
  uint16x8_t right_sums = vdupq_n_u16(0);
  // A negative per-lane count makes vshlq_u16 shift right.
  const int16x8_t neg_shift = vdupq_n_s16(-((height >> 5) + 3));
  int done;

  for (done = 0; done < height; done += 8) {
    int k;
    for (k = 0; k < 8; ++k) {
      const uint8x16_t row = vld1q_u8(ref);
      left_sums = vaddw_u8(left_sums, vget_low_u8(row));
      right_sums = vaddw_u8(right_sums, vget_high_u8(row));
      ref += ref_stride;
    }
  }

  left_sums = vshlq_u16(left_sums, neg_shift);
  right_sums = vshlq_u16(right_sums, neg_shift);

  vst1q_s16(hbuf, vreinterpretq_s16_u16(left_sums));
  vst1q_s16(hbuf + 8, vreinterpretq_s16_u16(right_sums));
}

// Sum of the bytes of one row starting at ref, read 16 at a time until at
// least `width` bytes have been consumed.
int16_t aom_int_pro_col_neon(uint8_t const *ref, const int width) {
  uint16x8_t lane_sums = vdupq_n_u16(0);
  int consumed = 0;

  while (consumed < width) {
    const uint8x16_t chunk = vld1q_u8(ref);
    lane_sums = vaddw_u8(lane_sums, vget_low_u8(chunk));
    lane_sums = vaddw_u8(lane_sums, vget_high_u8(chunk));
    ref += 16;
    consumed += 16;
  }

  return horizontal_add_u16x8(lane_sums);
}

// ref, src = [0, 510] - max diff = 16-bits
// bwl = {2, 3, 4}, width = {16, 32, 64}
// Returns sum(diff^2) - (sum(diff)^2 >> (bwl + 2)) over the 4 << bwl
// element-wise differences ref[i] - src[i].
int aom_vector_var_neon(int16_t const *ref, int16_t const *src, int bwl) {
  int remaining = 4 << bwl;
  int32x4_t sq_acc = vdupq_n_s32(0);
  int16x8_t diff_acc = vdupq_n_s16(0);

  assert(remaining >= 8);
  assert((remaining % 8) == 0);

  do {
    // [-510, 510], 10 bits.
    const int16x8_t delta = vsubq_s16(vld1q_s16(ref), vld1q_s16(src));
    const int16x4_t delta_lo = vget_low_s16(delta);
    const int16x4_t delta_hi = vget_high_s16(delta);
    // dynamic range 26 bits.
    sq_acc = vmlal_s16(sq_acc, delta_lo, delta_lo);
    sq_acc = vmlal_s16(sq_acc, delta_hi, delta_hi);
    // dynamic range 16 bits.
    diff_acc = vaddq_s16(diff_acc, delta);

    ref += 8;
    src += 8;
    remaining -= 8;
  } while (remaining != 0);

  {
    // Note: the pairwise addition of diff_acc could be implemented similarly
    // to horizontal_add_u16x8(), but one less vpaddl on it when paired with
    // the summation of sq_acc performed better on a Cortex-A15.
    const int32x4_t d4 = vpaddlq_s16(diff_acc);
    const int32x2_t d2 = vadd_s32(vget_low_s32(d4), vget_high_s32(d4));
    const int sum_diff = vget_lane_s32(vpadd_s32(d2, d2), 0);

    const int64x2_t q2 = vpaddlq_s32(sq_acc);
    const int sum_sq =
        vget_lane_s32(vadd_s32(vreinterpret_s32_s64(vget_low_s64(q2)),
                               vreinterpret_s32_s64(vget_high_s64(q2))),
                      0);

    return sum_sq - ((sum_diff * sum_diff) >> (bwl + 2));
  }
}

// Writes to *min and *max the smallest and largest absolute difference
// between co-located bytes of the 8x8 blocks a and b.
void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
                         int b_stride, int *min, int *max) {
  uint8x16_t widest, narrowest;
  uint8x8_t max8, min8;
  int pair;

  // Rows 0 and 1, concatenated into one Q register, seed both extremes.
  widest = narrowest =
      vabdq_u8(vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride)),
               vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride)));

  // Fold in the remaining row pairs (2,3), (4,5) and (6,7).
  for (pair = 1; pair < 4; ++pair) {
    const uint8_t *const pa = a + 2 * pair * a_stride;
    const uint8_t *const pb = b + 2 * pair * b_stride;
    const uint8x16_t abs_diff =
        vabdq_u8(vcombine_u8(vld1_u8(pa), vld1_u8(pa + a_stride)),
                 vcombine_u8(vld1_u8(pb), vld1_u8(pb + b_stride)));
    widest = vmaxq_u8(widest, abs_diff);
    narrowest = vminq_u8(narrowest, abs_diff);
  }

  // Split to D registers, then reduce pairwise.
  max8 = vmax_u8(vget_high_u8(widest), vget_low_u8(widest));
  min8 = vmin_u8(vget_high_u8(narrowest), vget_low_u8(narrowest));

  // Three rounds of vpmax/vpmin propagate the extreme to every position.
  for (pair = 0; pair < 3; ++pair) {
    max8 = vpmax_u8(max8, max8);
    min8 = vpmin_u8(min8, min8);
  }

  *min = *max = 0;  // Clear high bits
  // Store directly to avoid costly neon->gpr transfer.  The byte is written
  // at the lowest address of each int.
  // NOTE(review): that is the low-order byte only on little-endian targets;
  // confirm big-endian builds are not expected.
  vst1_lane_u8((uint8_t *)max, max8, 0);
  vst1_lane_u8((uint8_t *)min, min8, 0);
}
;
; Two-pass bilinear filter for ARM cores with the media (DSP) instructions.
; Both passes work on 2-tap filters held as a pair of 16-bit coefficients in
; one register and applied with SMUAD / SMUADX.
;

    EXPORT  |aom_filter_block2d_bil_first_pass_media|
    EXPORT  |aom_filter_block2d_bil_second_pass_media|

    AREA    |.text|, CODE, READONLY  ; name this block of code

;-------------------------------------
; r0    unsigned char  *src_ptr,
; r1    unsigned short *dst_ptr,
; r2    unsigned int    src_pitch,
; r3    unsigned int    height,
; stack unsigned int    width,
; stack const short    *aom_filter
;-------------------------------------
; First (horizontal) pass.  Each output is
;   (src[i] * f0 + src[i + 1] * f1 + 64) >> 7, saturated to 16 bits,
; and is stored as a halfword.  The output is stored transposed in the output
; array to make it easy for second pass filtering: consecutive outputs of one
; input row are written height * 2 + 2 bytes apart.
; Four outputs are produced per inner-loop iteration (counter = width >> 2).
|aom_filter_block2d_bil_first_pass_media| PROC
    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes pushed

    ldr     r11, [sp, #40]                  ; aom_filter address
    ldr     r4, [sp, #36]                   ; width

    mov     r12, r3                         ; outer-loop counter

    add     r7, r2, r4                      ; preload next row
    pld     [r0, r7]

    sub     r2, r2, r4                      ; src increment for height loop

    ldr     r5, [r11]                       ; load up filter coefficients (both 16-bit taps in one word)

    mov     r3, r3, lsl #1                  ; height*2
    add     r3, r3, #2                      ; plus 2 to keep the output buffer aligned since height is actually (height+1)
                                            ; NOTE(review): the original comment said "4-bit aligned"; presumably 4-byte is meant

    mov     r11, r1                         ; save dst_ptr for each row

    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
    beq     bil_null_1st_filter

|bil_height_loop_1st_v6|
    ldrb    r6, [r0]                        ; load source data
    ldrb    r7, [r0, #1]
    ldrb    r8, [r0, #2]
    mov     lr, r4, lsr #2                  ; 4-in-parallel loop counter

|bil_width_loop_1st_v6|
    ldrb    r9, [r0, #3]
    ldrb    r10, [r0, #4]

    pkhbt   r6, r6, r7, lsl #16             ; src[1] | src[0]
    pkhbt   r7, r7, r8, lsl #16             ; src[2] | src[1]

    smuad   r6, r6, r5                      ; apply the filter
    pkhbt   r8, r8, r9, lsl #16             ; src[3] | src[2]
    smuad   r7, r7, r5
    pkhbt   r9, r9, r10, lsl #16            ; src[4] | src[3]

    smuad   r8, r8, r5
    smuad   r9, r9, r5

    add     r0, r0, #4
    subs    lr, lr, #1                      ; flags from here drive the ldrne*/bne below

    add     r6, r6, #0x40                   ; round_shift_and_clamp
    add     r7, r7, #0x40
    usat    r6, #16, r6, asr #7
    usat    r7, #16, r7, asr #7

    strh    r6, [r1], r3                    ; result is transposed and stored

    add     r8, r8, #0x40                   ; round_shift_and_clamp
    strh    r7, [r1], r3
    add     r9, r9, #0x40
    usat    r8, #16, r8, asr #7
    usat    r9, #16, r9, asr #7

    strh    r8, [r1], r3                    ; result is transposed and stored

    ldrneb  r6, [r0]                        ; load source data (only if another iteration follows)
    strh    r9, [r1], r3

    ldrneb  r7, [r0, #1]
    ldrneb  r8, [r0, #2]

    bne     bil_width_loop_1st_v6

    add     r0, r0, r2                      ; move to next input row
    subs    r12, r12, #1

    add     r9, r2, r4, lsl #1              ; adding back block width
    pld     [r0, r9]                        ; preload next row

    add     r11, r11, #2                    ; move over to next column
    mov     r1, r11

    bne     bil_height_loop_1st_v6

    ldmia   sp!, {r4 - r11, pc}

; Pass-through path, taken when the loaded coefficient word equals 128: the
; source bytes are widened to halfwords and stored transposed, unfiltered.
|bil_null_1st_filter|
|bil_height_loop_null_1st|
    mov     lr, r4, lsr #2                  ; loop counter

|bil_width_loop_null_1st|
    ldrb    r6, [r0]                        ; load data
    ldrb    r7, [r0, #1]
    ldrb    r8, [r0, #2]
    ldrb    r9, [r0, #3]

    strh    r6, [r1], r3                    ; store it to intermediate buffer
    add     r0, r0, #4
    strh    r7, [r1], r3
    subs    lr, lr, #1
    strh    r8, [r1], r3
    strh    r9, [r1], r3

    bne     bil_width_loop_null_1st

    subs    r12, r12, #1
    add     r0, r0, r2                      ; move to next input line
    add     r11, r11, #2                    ; move over to next column
    mov     r1, r11

    bne     bil_height_loop_null_1st

    ldmia   sp!, {r4 - r11, pc}

    ENDP  ; |aom_filter_block2d_bil_first_pass_media|


;---------------------------------
; r0    unsigned short *src_ptr,
; r1    unsigned char  *dst_ptr,
; r2    int             dst_pitch,
; r3    unsigned int    height,
; stack unsigned int    width,
; stack const short    *aom_filter
;---------------------------------
; Second pass.  Reads the transposed halfword buffer written by the first
; pass, filters adjacent halfwords, and stores each result as a byte
; (rounded, >> 7, saturated to 8 bits) with a stride of dst_pitch, which
; transposes the data back.  Four outputs per inner-loop iteration
; (counter = height >> 2); the outer loop runs width times.
|aom_filter_block2d_bil_second_pass_media| PROC
    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes pushed

    ldr     r11, [sp, #40]                  ; aom_filter address
    ldr     r4, [sp, #36]                   ; width

    ldr     r5, [r11]                       ; load up filter coefficients
    mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix
    mov     r11, r1                         ; save dst_ptr for each output column

    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
    beq     bil_null_2nd_filter

|bil_height_loop_2nd|
    ldr     r6, [r0]                        ; load the data
    ldr     r8, [r0, #4]
    ldrh    r10, [r0, #8]
    mov     lr, r3, lsr #2                  ; loop counter

|bil_width_loop_2nd|
    pkhtb   r7, r6, r8                      ; src[1] | src[2]
    pkhtb   r9, r8, r10                     ; src[3] | src[4]

    smuad   r6, r6, r5                      ; apply filter
    smuad   r8, r8, r5                      ; apply filter

    subs    lr, lr, #1                      ; flags from here drive the ldrne*/bne below

    smuadx  r7, r7, r5                      ; apply filter (halfwords are swapped in r7, hence the X form)
    smuadx  r9, r9, r5                      ; apply filter

    add     r0, r0, #8

    add     r6, r6, #0x40                   ; round_shift_and_clamp
    add     r7, r7, #0x40
    usat    r6, #8, r6, asr #7
    usat    r7, #8, r7, asr #7
    strb    r6, [r1], r2                    ; the result is transposed back and stored

    add     r8, r8, #0x40                   ; round_shift_and_clamp
    strb    r7, [r1], r2
    add     r9, r9, #0x40
    usat    r8, #8, r8, asr #7
    usat    r9, #8, r9, asr #7
    strb    r8, [r1], r2                    ; the result is transposed back and stored

    ldrne   r6, [r0]                        ; load data (only if another iteration follows)
    strb    r9, [r1], r2
    ldrne   r8, [r0, #4]
    ldrneh  r10, [r0, #8]

    bne     bil_width_loop_2nd

    subs    r12, r12, #1
    add     r0, r0, #4                      ; update src for next row
    add     r11, r11, #1
    mov     r1, r11

    bne     bil_height_loop_2nd
    ldmia   sp!, {r4 - r11, pc}

; Pass-through path, taken when the loaded coefficient word equals 128: the
; low byte of every halfword is stored unfiltered.
|bil_null_2nd_filter|
|bil_height_loop_null_2nd|
    mov     lr, r3, lsr #2

|bil_width_loop_null_2nd|
    ldr     r6, [r0], #4                    ; load data
    subs    lr, lr, #1
    ldr     r8, [r0], #4

    strb    r6, [r1], r2                    ; store data
    mov     r7, r6, lsr #16
    strb    r7, [r1], r2
    mov     r9, r8, lsr #16
    strb    r8, [r1], r2
    strb    r9, [r1], r2

    bne     bil_width_loop_null_2nd

    subs    r12, r12, #1
    add     r0, r0, #4
    add     r11, r11, #1
    mov     r1, r11

    bne     bil_height_loop_null_2nd

    ldmia   sp!, {r4 - r11, pc}
    ENDP  ; |aom_filter_block2d_bil_second_pass_media|

    END
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_config.h" +#include "aom_dsp/txfm_common.h" + +void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { + int i; + // stage 1 + int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); + int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); + int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); + int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); + int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); + int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); + int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); + int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); + for (i = 0; i < 2; ++i) { + int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7; + const int16x8_t v_s0 = vaddq_s16(input_0, input_7); + const int16x8_t v_s1 = vaddq_s16(input_1, input_6); + const int16x8_t v_s2 = vaddq_s16(input_2, input_5); + const int16x8_t v_s3 = vaddq_s16(input_3, input_4); + const int16x8_t v_s4 = vsubq_s16(input_3, input_4); + const int16x8_t v_s5 = vsubq_s16(input_2, input_5); + const int16x8_t v_s6 = vsubq_s16(input_1, input_6); + const int16x8_t v_s7 = vsubq_s16(input_0, input_7); + // fdct4(step, step); + int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); + int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); + int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); + int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); + // fdct4(step, step); + int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t2_lo = 
vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64); + int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64); + v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64); + v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); + v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64); + v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64); + v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64); + v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 + out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 + out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 + out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67 + } + // Stage 2 + v_x0 = vsubq_s16(v_s6, v_s5); + v_x1 = vaddq_s16(v_s6, v_s5); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, 
DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x8_t ab = vcombine_s16(a, b); + const int16x8_t cd = vcombine_s16(c, d); + // Stage 3 + v_x0 = vaddq_s16(v_s4, ab); + v_x1 = vsubq_s16(v_s4, ab); + v_x2 = vsubq_s16(v_s7, cd); + v_x3 = vaddq_s16(v_s7, cd); + } + // Stage 4 + v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64); + v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64); + v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64); + v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64); + v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64); + v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64); + v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64); + v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64); + v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64); + v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64); + v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, 
DCT_CONST_BITS); + out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 + out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 + out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 + out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 + } + // transpose 8x8 + { + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + // 04 05 06 07 44 45 46 47 + // 14 15 16 17 54 55 56 57 + // 24 25 26 27 64 65 66 67 + // 34 35 36 37 74 75 76 77 + const int32x4x2_t r02_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2)); + const int32x4x2_t r13_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3)); + const int32x4x2_t r46_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6)); + const int32x4x2_t r57_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7)); + const int16x8x2_t r01_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), + vreinterpretq_s16_s32(r13_s32.val[0])); + const int16x8x2_t r23_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), + vreinterpretq_s16_s32(r13_s32.val[1])); + const int16x8x2_t r45_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), + vreinterpretq_s16_s32(r57_s32.val[0])); + const int16x8x2_t r67_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), + vreinterpretq_s16_s32(r57_s32.val[1])); + input_0 = r01_s16.val[0]; + input_1 = r01_s16.val[1]; + input_2 = r23_s16.val[0]; + input_3 = r23_s16.val[1]; + input_4 = r45_s16.val[0]; + input_5 = r45_s16.val[1]; + input_6 = r67_s16.val[0]; + input_7 = r67_s16.val[1]; + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } // for + { + // from aom_dct_sse2.c + // Post-condition (division by two) + // division of two 16 bits signed numbers using 
shifts + // n / 2 = (n - (n >> 15)) >> 1 + const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15); + const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15); + const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15); + const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15); + const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15); + const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15); + const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15); + const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15); + input_0 = vhsubq_s16(input_0, sign_in0); + input_1 = vhsubq_s16(input_1, sign_in1); + input_2 = vhsubq_s16(input_2, sign_in2); + input_3 = vhsubq_s16(input_3, sign_in3); + input_4 = vhsubq_s16(input_4, sign_in4); + input_5 = vhsubq_s16(input_5, sign_in5); + input_6 = vhsubq_s16(input_6, sign_in6); + input_7 = vhsubq_s16(input_7, sign_in7); + // store results + vst1q_s16(&final_output[0 * 8], input_0); + vst1q_s16(&final_output[1 * 8], input_1); + vst1q_s16(&final_output[2 * 8], input_2); + vst1q_s16(&final_output[3 * 8], input_3); + vst1q_s16(&final_output[4 * 8], input_4); + vst1q_s16(&final_output[5 * 8], input_5); + vst1q_s16(&final_output[6 * 8], input_6); + vst1q_s16(&final_output[7 * 8], input_7); + } +} + +void aom_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { + int r; + int16x8_t sum = vld1q_s16(&input[0]); + for (r = 1; r < 8; ++r) { + const int16x8_t input_00 = vld1q_s16(&input[r * stride]); + sum = vaddq_s16(sum, input_00); + } + { + const int32x4_t a = vpaddlq_s16(sum); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0); + output[1] = 0; + } +} diff --git a/third_party/aom/aom_dsp/arm/hadamard_neon.c b/third_party/aom/aom_dsp/arm/hadamard_neon.c new file mode 100644 index 000000000..9baefae47 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/hadamard_neon.c @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2016, 
Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" + +static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, + int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, int16x8_t *a7) { + const int16x8_t b0 = vaddq_s16(*a0, *a1); + const int16x8_t b1 = vsubq_s16(*a0, *a1); + const int16x8_t b2 = vaddq_s16(*a2, *a3); + const int16x8_t b3 = vsubq_s16(*a2, *a3); + const int16x8_t b4 = vaddq_s16(*a4, *a5); + const int16x8_t b5 = vsubq_s16(*a4, *a5); + const int16x8_t b6 = vaddq_s16(*a6, *a7); + const int16x8_t b7 = vsubq_s16(*a6, *a7); + + const int16x8_t c0 = vaddq_s16(b0, b2); + const int16x8_t c1 = vaddq_s16(b1, b3); + const int16x8_t c2 = vsubq_s16(b0, b2); + const int16x8_t c3 = vsubq_s16(b1, b3); + const int16x8_t c4 = vaddq_s16(b4, b6); + const int16x8_t c5 = vaddq_s16(b5, b7); + const int16x8_t c6 = vsubq_s16(b4, b6); + const int16x8_t c7 = vsubq_s16(b5, b7); + + *a0 = vaddq_s16(c0, c4); + *a1 = vsubq_s16(c2, c6); + *a2 = vsubq_s16(c0, c4); + *a3 = vaddq_s16(c2, c6); + *a4 = vaddq_s16(c3, c7); + *a5 = vsubq_s16(c3, c7); + *a6 = vsubq_s16(c1, c5); + *a7 = vaddq_s16(c1, c5); +} + +// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider +// reversing transpose order which may make it easier for the compiler to +// reconcile the vtrn.64 moves. +static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, + int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, int16x8_t *a7) { + // Swap 64 bit elements. 
Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 08 09 10 11 12 13 14 15 + // a2: 16 17 18 19 20 21 22 23 + // a3: 24 25 26 27 28 29 30 31 + // a4: 32 33 34 35 36 37 38 39 + // a5: 40 41 42 43 44 45 46 47 + // a6: 48 49 50 51 52 53 54 55 + // a7: 56 57 58 59 60 61 62 63 + // to: + // a04_lo: 00 01 02 03 32 33 34 35 + // a15_lo: 08 09 10 11 40 41 42 43 + // a26_lo: 16 17 18 19 48 49 50 51 + // a37_lo: 24 25 26 27 56 57 58 59 + // a04_hi: 04 05 06 07 36 37 38 39 + // a15_hi: 12 13 14 15 44 45 46 47 + // a26_hi: 20 21 22 23 52 53 54 55 + // a37_hi: 28 29 30 31 60 61 62 63 + const int16x8_t a04_lo = vcombine_s16(vget_low_s16(*a0), vget_low_s16(*a4)); + const int16x8_t a15_lo = vcombine_s16(vget_low_s16(*a1), vget_low_s16(*a5)); + const int16x8_t a26_lo = vcombine_s16(vget_low_s16(*a2), vget_low_s16(*a6)); + const int16x8_t a37_lo = vcombine_s16(vget_low_s16(*a3), vget_low_s16(*a7)); + const int16x8_t a04_hi = vcombine_s16(vget_high_s16(*a0), vget_high_s16(*a4)); + const int16x8_t a15_hi = vcombine_s16(vget_high_s16(*a1), vget_high_s16(*a5)); + const int16x8_t a26_hi = vcombine_s16(vget_high_s16(*a2), vget_high_s16(*a6)); + const int16x8_t a37_hi = vcombine_s16(vget_high_s16(*a3), vget_high_s16(*a7)); + + // Swap 32 bit elements resulting in: + // a0246_lo: + // 00 01 16 17 32 33 48 49 + // 02 03 18 19 34 35 50 51 + // a1357_lo: + // 08 09 24 25 40 41 56 57 + // 10 11 26 27 42 43 58 59 + // a0246_hi: + // 04 05 20 21 36 37 52 53 + // 06 07 22 23 38 39 54 55 + // a1657_hi: + // 12 13 28 29 44 45 60 61 + // 14 15 30 31 46 47 62 63 + const int32x4x2_t a0246_lo = + vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo)); + const int32x4x2_t a1357_lo = + vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo)); + const int32x4x2_t a0246_hi = + vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi)); + const int32x4x2_t a1357_hi = + vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi)); + + // Swap 16 bit 
elements resulting in: + // b0: + // 00 08 16 24 32 40 48 56 + // 01 09 17 25 33 41 49 57 + // b1: + // 02 10 18 26 34 42 50 58 + // 03 11 19 27 35 43 51 59 + // b2: + // 04 12 20 28 36 44 52 60 + // 05 13 21 29 37 45 53 61 + // b3: + // 06 14 22 30 38 46 54 62 + // 07 15 23 31 39 47 55 63 + const int16x8x2_t b0 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[0]), + vreinterpretq_s16_s32(a1357_lo.val[0])); + const int16x8x2_t b1 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[1]), + vreinterpretq_s16_s32(a1357_lo.val[1])); + const int16x8x2_t b2 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[0]), + vreinterpretq_s16_s32(a1357_hi.val[0])); + const int16x8x2_t b3 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[1]), + vreinterpretq_s16_s32(a1357_hi.val[1])); + + *a0 = b0.val[0]; + *a1 = b0.val[1]; + *a2 = b1.val[0]; + *a3 = b1.val[1]; + *a4 = b2.val[0]; + *a5 = b2.val[1]; + *a6 = b3.val[0]; + *a7 = b3.val[1]; +} + +void aom_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, + int16_t *coeff) { + int16x8_t a0 = vld1q_s16(src_diff); + int16x8_t a1 = vld1q_s16(src_diff + src_stride); + int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride); + int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride); + int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride); + int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride); + int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride); + int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride); + + hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + transpose8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + // Skip the second transpose because it is not required. 
+ + vst1q_s16(coeff + 0, a0); + vst1q_s16(coeff + 8, a1); + vst1q_s16(coeff + 16, a2); + vst1q_s16(coeff + 24, a3); + vst1q_s16(coeff + 32, a4); + vst1q_s16(coeff + 40, a5); + vst1q_s16(coeff + 48, a6); + vst1q_s16(coeff + 56, a7); +} + +void aom_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, + int16_t *coeff) { + int i; + + /* Rearrange 16x16 to 8x32 and remove stride. + * Top left first. */ + aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0); + /* Top right. */ + aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64); + /* Bottom left. */ + aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128); + /* Bottom right. */ + aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192); + + for (i = 0; i < 64; i += 8) { + const int16x8_t a0 = vld1q_s16(coeff + 0); + const int16x8_t a1 = vld1q_s16(coeff + 64); + const int16x8_t a2 = vld1q_s16(coeff + 128); + const int16x8_t a3 = vld1q_s16(coeff + 192); + + const int16x8_t b0 = vhaddq_s16(a0, a1); + const int16x8_t b1 = vhsubq_s16(a0, a1); + const int16x8_t b2 = vhaddq_s16(a2, a3); + const int16x8_t b3 = vhsubq_s16(a2, a3); + + const int16x8_t c0 = vaddq_s16(b0, b2); + const int16x8_t c1 = vaddq_s16(b1, b3); + const int16x8_t c2 = vsubq_s16(b0, b2); + const int16x8_t c3 = vsubq_s16(b1, b3); + + vst1q_s16(coeff + 0, c0); + vst1q_s16(coeff + 64, c1); + vst1q_s16(coeff + 128, c2); + vst1q_s16(coeff + 192, c3); + + coeff += 8; + } +} diff --git a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.asm b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.asm new file mode 100644 index 000000000..d01c4bc03 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.asm @@ -0,0 +1,201 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+
+
+    EXPORT  |aom_idct16x16_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; DC-only 16x16 inverse transform: adds one constant (a1) to all 256 pixels.
+;void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
+;                                    int dest_stride)
+;
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride)
+
+|aom_idct16x16_1_add_neon| PROC
+    ldrsh            r0, [r0]                  ; r0 = input[0], sign-extended
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest (r12 = store pointer)
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 6)
+    add              r0, r0, #32               ; + (1 <<((6) - 1))
+    asr              r0, r0, #6                ; >> 6
+
+    vdup.s16         q0, r0                    ; duplicate a1
+    mov              r0, #8                    ; r0 = 8: left half -> right half
+    sub              r2, #8                    ; r2 = dest_stride - 8: -> next row
+
+    ; load destination data row0 - row3 (each vld1 reads one 8-pixel half row)
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                ; dest[x] + a1
+    vaddw.u8         q10, q0, d7               ; dest[x] + a1
+    vaddw.u8         q11, q0, d16              ; dest[x] + a1
+    vaddw.u8         q12, q0, d17              ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row4 - row7
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                ; dest[x] + a1
+    vaddw.u8         q10, q0, d7               ; dest[x] + a1
+    vaddw.u8         q11, q0, d16              ; dest[x] + a1
+    vaddw.u8         q12, q0, d17              ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row8 - row11
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                ; dest[x] + a1
+    vaddw.u8         q10, q0, d7               ; dest[x] + a1
+    vaddw.u8         q11, q0, d16              ; dest[x] + a1
+    vaddw.u8         q12, q0, d17              ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row12 - row15
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                ; dest[x] + a1
+    vaddw.u8         q10, q0, d7               ; dest[x] + a1
+    vaddw.u8         q11, q0, d16              ; dest[x] + a1
+    vaddw.u8         q12, q0, d17              ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    bx               lr
+    ENDP             ; |aom_idct16x16_1_add_neon|
+
+    END
diff --git a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c
new file mode 100644
index 000000000..196b2a890
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom_dsp/inv_txfm.h" +#include "aom_ports/mem.h" + +void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8x8_t d2u8, d3u8, d30u8, d31u8; + uint64x1_t d2u64, d3u64, d4u64, d5u64; + uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q0s16; + uint8_t *d1, *d2; + int16_t i, j, a1; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 6); + + q0s16 = vdupq_n_s16(a1); + q0u16 = vreinterpretq_u16_s16(q0s16); + + for (d1 = d2 = dest, i = 0; i < 4; i++) { + for (j = 0; j < 2; j++) { + d2u64 = vld1_u64((const uint64_t *)d1); + d3u64 = vld1_u64((const uint64_t *)(d1 + 8)); + d1 += dest_stride; + d4u64 = vld1_u64((const uint64_t *)d1); + d5u64 = vld1_u64((const uint64_t *)(d1 + 8)); + d1 += dest_stride; + + q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); + q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); + q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); + q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); + vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8)); + d2 += dest_stride; + } + } + return; +} diff --git a/third_party/aom/aom_dsp/arm/idct16x16_add_neon.asm 
b/third_party/aom/aom_dsp/arm/idct16x16_add_neon.asm
new file mode 100644
index 000000000..4a8f8f183
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct16x16_add_neon.asm
@@ -0,0 +1,1182 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+    EXPORT  |aom_idct16x16_256_add_neon_pass1|
+    EXPORT  |aom_idct16x16_256_add_neon_pass2|
+    EXPORT  |aom_idct16x16_10_add_neon_pass1|
+    EXPORT  |aom_idct16x16_10_add_neon_pass2|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Transpose an 8x8 matrix of 16-bit elements held in q8-q15, in place.
+    MACRO
+    TRANSPOSE8X8
+    vswp            d17, d24
+    vswp            d23, d30
+    vswp            d21, d28
+    vswp            d19, d26
+    vtrn.32         q8, q10
+    vtrn.32         q9, q11
+    vtrn.32         q12, q14
+    vtrn.32         q13, q15
+    vtrn.16         q8, q9
+    vtrn.16         q10, q11
+    vtrn.16         q12, q13
+    vtrn.16         q14, q15
+    MEND
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void |aom_idct16x16_256_add_neon_pass1|(int16_t *input,
+;                                          int16_t *output, int output_stride)
+;
+; r0  int16_t *input
+; r1  int16_t *output
+; r2  int output_stride) (added to r1 as-is after each 8-byte store)
+
+; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
+; will be stored back into q8-q15 registers. This function will touch q0-q7
+; registers and use them as buffer during calculation.
+|aom_idct16x16_256_add_neon_pass1| PROC
+
+    ; TODO(hkuang): Find a better way to load the elements.
+    ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
+    vld2.s16        {q8,q9}, [r0]!            ; vld2 de-interleaves: evens -> q8
+    vld2.s16        {q9,q10}, [r0]!           ; odds in q9 are overwritten here
+    vld2.s16        {q10,q11}, [r0]!
+    vld2.s16        {q11,q12}, [r0]!
+    vld2.s16        {q12,q13}, [r0]!
+    vld2.s16        {q13,q14}, [r0]!
+    vld2.s16        {q14,q15}, [r0]!
+    vld2.s16        {q1,q2}, [r0]!
+    vmov.s16        q15, q1                   ; keep the even elements of the last load
+
+    ; generate  cospi_28_64 = 3196
+    mov             r3, #0xc00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64  = 16069
+    mov             r12, #0x3e00
+    add             r12, #0xc5
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; stage 3
+    vdup.16         d0, r3                    ; duplicate cospi_28_64
+    vdup.16         d1, r12                   ; duplicate cospi_4_64
+
+    ; preloading to avoid stall
+    ; generate cospi_12_64 = 13623
+    mov             r3, #0x3500
+    add             r3, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r12, #0x2300
+    add             r12, #0x8e
+
+    ; step2[4] * cospi_28_64
+    vmull.s16       q2, d18, d0
+    vmull.s16       q3, d19, d0
+
+    ; step2[4] * cospi_4_64
+    vmull.s16       q5, d18, d1
+    vmull.s16       q6, d19, d1
+
+    ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64
+    vmlsl.s16       q2, d30, d1
+    vmlsl.s16       q3, d31, d1
+
+    ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64
+    vmlal.s16       q5, d30, d0
+    vmlal.s16       q6, d31, d0
+
+    vdup.16         d2, r3                    ; duplicate cospi_12_64
+    vdup.16         d3, r12                   ; duplicate cospi_20_64
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d8, q2, #14               ; >> 14
+    vqrshrn.s32     d9, q3, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d14, q5, #14              ; >> 14
+    vqrshrn.s32     d15, q6, #14              ; >> 14
+
+    ; preloading to avoid stall
+    ; generate cospi_16_64 = 11585
+    mov             r3, #0x2d00
+    add             r3, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r12, #0x1800
+    add             r12, #0x7e
+
+    ; step2[5] * cospi_12_64
+    vmull.s16       q2, d26, d2
+    vmull.s16       q3, d27, d2
+
+    ; step2[5] * cospi_20_64
+    vmull.s16       q9, d26, d3
+    vmull.s16       q15, d27, d3
+
+    ; temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64
+    vmlsl.s16       q2, d22, d3
+    vmlsl.s16       q3, d23, d3
+
+    ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64
+    vmlal.s16       q9, d22, d2
+    vmlal.s16       q15, d23, d2
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d10, q2, #14              ; >> 14
+    vqrshrn.s32     d11, q3, #14              ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q15, #14             ; >> 14
+
+    ; stage 4
+    vdup.16         d30, r3                   ; cospi_16_64
+
+    ; step1[0] * cospi_16_64
+    vmull.s16       q2, d16, d30
+    vmull.s16       q11, d17, d30
+
+    ; step1[1] * cospi_16_64
+    vmull.s16       q0, d24, d30
+    vmull.s16       q1, d25, d30
+
+    ; generate cospi_8_64 = 15137
+    mov             r3, #0x3b00
+    add             r3, #0x21
+
+    vdup.16         d30, r12                  ; duplicate cospi_24_64
+    vdup.16         d31, r3                   ; duplicate cospi_8_64
+
+    ; temp1 = (step1[0] + step1[1]) * cospi_16_64
+    vadd.s32        q3, q2, q0
+    vadd.s32        q12, q11, q1
+
+    ; temp2 = (step1[0] - step1[1]) * cospi_16_64
+    vsub.s32        q13, q2, q0
+    vsub.s32        q1, q11, q1
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d16, q3, #14              ; >> 14
+    vqrshrn.s32     d17, q12, #14             ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d18, q13, #14             ; >> 14
+    vqrshrn.s32     d19, q1, #14              ; >> 14
+
+    ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+    ; step1[2] * cospi_8_64
+    vmull.s16       q0, d20, d31
+    vmull.s16       q1, d21, d31
+
+    ; step1[2] * cospi_24_64
+    vmull.s16       q12, d20, d30
+    vmull.s16       q13, d21, d30
+
+    ; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64
+    vmlal.s16       q0, d28, d30
+    vmlal.s16       q1, d29, d30
+
+    ; temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64
+    vmlsl.s16       q12, d28, d31
+    vmlsl.s16       q13, d29, d31
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d22, q0, #14              ; >> 14
+    vqrshrn.s32     d23, q1, #14              ; >> 14
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d20, q12, #14             ; >> 14
+    vqrshrn.s32     d21, q13, #14             ; >> 14
+
+    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5];
+    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5];
+    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7];
+    vadd.s16        q15, q6, q7               ; step2[7] = step1[6] + step1[7];
+
+    ; generate cospi_16_64 = 11585
+    mov             r3, #0x2d00
+    add             r3, #0x41
+
+    ; stage 5
+    vadd.s16        q0, q8, q11               ; step1[0] = step2[0] + step2[3];
+    vadd.s16        q1, q9, q10               ; step1[1] = step2[1] + step2[2];
+    vsub.s16        q2, q9, q10               ; step1[2] = step2[1] - step2[2];
+    vsub.s16        q3, q8, q11               ; step1[3] = step2[0] - step2[3];
+
+    vdup.16         d16, r3                   ; duplicate cospi_16_64
+
+    ; step2[5] * cospi_16_64
+    vmull.s16       q11, d26, d16
+    vmull.s16       q12, d27, d16
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d28, d16
+    vmull.s16       q10, d29, d16
+
+    ; temp1 = (step2[6] - step2[5]) * cospi_16_64
+    vsub.s32        q6, q9, q11
+    vsub.s32        q13, q10, q12
+
+    ; temp2 = (step2[5] + step2[6]) * cospi_16_64
+    vadd.s32        q9, q9, q11
+    vadd.s32        q10, q10, q12
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d10, q6, #14              ; >> 14
+    vqrshrn.s32     d11, q13, #14             ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q10, #14             ; >> 14
+
+    ; stage 6
+    vadd.s16        q8, q0, q15               ; step2[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; step2[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; step2[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; step2[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; step2[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; step2[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; step2[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q15              ; step2[7] = step1[0] - step1[7];
+
+    ; store the data: sixteen 8-byte stores (d16-d31), r1 advanced by r2 each
+    vst1.64         {d16}, [r1], r2
+    vst1.64         {d17}, [r1], r2
+    vst1.64         {d18}, [r1], r2
+    vst1.64         {d19}, [r1], r2
+    vst1.64         {d20}, [r1], r2
+    vst1.64         {d21}, [r1], r2
+    vst1.64         {d22}, [r1], r2
+    vst1.64         {d23}, [r1], r2
+    vst1.64         {d24}, [r1], r2
+    vst1.64         {d25}, [r1], r2
+    vst1.64         {d26}, [r1], r2
+    vst1.64         {d27}, [r1], r2
+    vst1.64         {d28}, [r1], r2
+    vst1.64         {d29}, [r1], r2
+    vst1.64         {d30}, [r1], r2
+    vst1.64         {d31}, [r1], r2
+
+    bx              lr
+    ENDP  ; |aom_idct16x16_256_add_neon_pass1|
+
+;void aom_idct16x16_256_add_neon_pass2(int16_t *src,
+;                               int16_t *output,
+;                               int16_t *pass1Output,
+;                               int16_t skip_adding,
+;                               uint8_t *dest,
+;                               int dest_stride)
+;
+; r0  int16_t *src
+; r1  int16_t *output,
+; r2  int16_t *pass1Output,
+; r3  int16_t skip_adding,
+; r4  uint8_t *dest,
+; r5  int dest_stride)
+
+; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
+; will be stored back into q8-q15 registers.
This function will touch q0-q7 +; registers and use them as buffer during calculation. +|aom_idct16x16_256_add_neon_pass2| PROC + push {r3-r9} + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q0,q1}, [r0]! + vmov.s16 q15, q0; + + ; generate cospi_30_64 = 1606 + mov r3, #0x0600 + add r3, #0x46 + + ; generate cospi_2_64 = 16305 + mov r12, #0x3f00 + add r12, #0xb1 + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 d12, r3 ; duplicate cospi_30_64 + vdup.16 d13, r12 ; duplicate cospi_2_64 + + ; preloading to avoid stall + ; generate cospi_14_64 = 12665 + mov r3, #0x3100 + add r3, #0x79 + + ; generate cospi_18_64 = 10394 + mov r12, #0x2800 + add r12, #0x9a + + ; step1[8] * cospi_30_64 + vmull.s16 q2, d16, d12 + vmull.s16 q3, d17, d12 + + ; step1[8] * cospi_2_64 + vmull.s16 q1, d16, d13 + vmull.s16 q4, d17, d13 + + ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64 + vmlsl.s16 q2, d30, d13 + vmlsl.s16 q3, d31, d13 + + ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64 + vmlal.s16 q1, d30, d12 + vmlal.s16 q4, d31, d12 + + vdup.16 d30, r3 ; duplicate cospi_14_64 + vdup.16 d31, r12 ; duplicate cospi_18_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d0, q2, #14 ; >> 14 + vqrshrn.s32 d1, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d14, q1, #14 ; >> 14 + vqrshrn.s32 d15, q4, #14 ; >> 14 + + ; preloading to avoid stall + ; generate cospi_22_64 = 7723 + mov r3, #0x1e00 + add r3, #0x2b + + ; generate cospi_10_64 = 14449 + mov r12, #0x3800 + add r12, #0x71 + + ; step1[9] * cospi_14_64 + vmull.s16 q2, d24, d30 + vmull.s16 q3, d25, d30 + + ; step1[9] * cospi_18_64 + vmull.s16 q4, d24, d31 + vmull.s16 q5, d25, d31 + + ; temp1 = step1[9] * cospi_14_64 - 
step1[14] * cospi_18_64 + vmlsl.s16 q2, d22, d31 + vmlsl.s16 q3, d23, d31 + + ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64 + vmlal.s16 q4, d22, d30 + vmlal.s16 q5, d23, d30 + + vdup.16 d30, r3 ; duplicate cospi_22_64 + vdup.16 d31, r12 ; duplicate cospi_10_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d2, q2, #14 ; >> 14 + vqrshrn.s32 d3, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q4, #14 ; >> 14 + vqrshrn.s32 d13, q5, #14 ; >> 14 + + ; step1[10] * cospi_22_64 + vmull.s16 q11, d20, d30 + vmull.s16 q12, d21, d30 + + ; step1[10] * cospi_10_64 + vmull.s16 q4, d20, d31 + vmull.s16 q5, d21, d31 + + ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64 + vmlsl.s16 q11, d26, d31 + vmlsl.s16 q12, d27, d31 + + ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64 + vmlal.s16 q4, d26, d30 + vmlal.s16 q5, d27, d30 + + ; preloading to avoid stall + ; generate cospi_6_64 = 15679 + mov r3, #0x3d00 + add r3, #0x3f + + ; generate cospi_26_64 = 4756 + mov r12, #0x1200 + add r12, #0x94 + + vdup.16 d30, r3 ; duplicate cospi_6_64 + vdup.16 d31, r12 ; duplicate cospi_26_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q11, #14 ; >> 14 + vqrshrn.s32 d5, q12, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d11, q5, #14 ; >> 14 + vqrshrn.s32 d10, q4, #14 ; >> 14 + + ; step1[11] * cospi_6_64 + vmull.s16 q10, d28, d30 + vmull.s16 q11, d29, d30 + + ; step1[11] * cospi_26_64 + vmull.s16 q12, d28, d31 + vmull.s16 q13, d29, d31 + + ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64 + vmlsl.s16 q10, d18, d31 + vmlsl.s16 q11, d19, d31 + + ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64 + vmlal.s16 q12, d18, d30 + vmlal.s16 q13, d19, d30 + + vsub.s16 q9, q0, q1 ; step1[9]=step2[8]-step2[9] + vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9] + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d6, q10, #14 ; >> 14 + vqrshrn.s32 d7, q11, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d8, q12, 
#14 ; >> 14 + vqrshrn.s32 d9, q13, #14 ; >> 14 + + ; stage 3 + vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11] + vadd.s16 q11, q2, q3 ; step1[11]=step2[10]+step2[11] + vadd.s16 q12, q4, q5 ; step1[12]=step2[12]+step2[13] + vsub.s16 q13, q4, q5 ; step1[13]=step2[12]-step2[13] + vsub.s16 q14, q7, q6 ; step1[14]=-step2[14]+step2[15] + vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15] + + ; stage 4 + ; generate cospi_24_64 = 6270 + mov r3, #0x1800 + add r3, #0x7e + + ; generate cospi_8_64 = 15137 + mov r12, #0x3b00 + add r12, #0x21 + + ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 + vdup.16 d30, r12 ; duplicate cospi_8_64 + vdup.16 d31, r3 ; duplicate cospi_24_64 + + ; step1[9] * cospi_24_64 + vmull.s16 q2, d18, d31 + vmull.s16 q3, d19, d31 + + ; step1[14] * cospi_24_64 + vmull.s16 q4, d28, d31 + vmull.s16 q5, d29, d31 + + ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 + vmlal.s16 q2, d28, d30 + vmlal.s16 q3, d29, d30 + + ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 + vmlsl.s16 q4, d18, d30 + vmlsl.s16 q5, d19, d30 + + rsb r12, #0 + vdup.16 d30, r12 ; duplicate -cospi_8_64 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q2, #14 ; >> 14 + vqrshrn.s32 d13, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d2, q4, #14 ; >> 14 + vqrshrn.s32 d3, q5, #14 ; >> 14 + + vmov.s16 q3, q11 + vmov.s16 q4, q12 + + ; - step1[13] * cospi_8_64 + vmull.s16 q11, d26, d30 + vmull.s16 q12, d27, d30 + + ; -step1[10] * cospi_8_64 + vmull.s16 q8, d20, d30 + vmull.s16 q9, d21, d30 + + ; temp2 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64 + vmlsl.s16 q11, d20, d31 + vmlsl.s16 q12, d21, d31 + + ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 + vmlal.s16 q8, d26, d31 + vmlal.s16 q9, d27, d31 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d4, q11, #14 ; >> 14 + vqrshrn.s32 d5, q12, #14 ; >> 14 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d10, q8, #14 ; >> 14 + vqrshrn.s32 d11, q9, #14 ; >> 14 + + ; stage 5 + vadd.s16
q8, q0, q3 ; step1[8] = step2[8]+step2[11]; + vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; + vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; + vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; + vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; + vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; + vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; + vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; + + ; stage 6. + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + vdup.16 d14, r12 ; duplicate cospi_16_64 + + ; step1[13] * cospi_16_64 + vmull.s16 q3, d26, d14 + vmull.s16 q4, d27, d14 + + ; step1[10] * cospi_16_64 + vmull.s16 q0, d20, d14 + vmull.s16 q1, d21, d14 + + ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 + vsub.s32 q5, q3, q0 + vsub.s32 q6, q4, q1 + + ; temp2 = (step1[10] + step1[13]) * cospi_16_64 + vadd.s32 q10, q3, q0 + vadd.s32 q4, q4, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q5, #14 ; >> 14 + vqrshrn.s32 d5, q6, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d10, q10, #14 ; >> 14 + vqrshrn.s32 d11, q4, #14 ; >> 14 + + ; step1[11] * cospi_16_64 + vmull.s16 q0, d22, d14 + vmull.s16 q1, d23, d14 + + ; step1[12] * cospi_16_64 + vmull.s16 q13, d24, d14 + vmull.s16 q6, d25, d14 + + ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 + vsub.s32 q10, q13, q0 + vsub.s32 q4, q6, q1 + + ; temp2 = (step1[11] + step1[12]) * cospi_16_64 + vadd.s32 q13, q13, q0 + vadd.s32 q6, q6, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d6, q10, #14 ; >> 14 + vqrshrn.s32 d7, q4, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d8, q13, #14 ; >> 14 + vqrshrn.s32 d9, q6, #14 ; >> 14 + + mov r4, #16 ; pass1Output stride + ldr r3, [sp] ; load skip_adding + cmp r3, #0 ; check if need adding dest data + beq skip_adding_dest + + ldr r7, [sp, #28] ; dest used to save element 0-7 + mov r9, r7 ; save dest pointer for later use + ldr r8, [sp, #32] ; load dest_stride + + ; stage 7 + ; 
load the data in pass1 + vld1.s16 {q0}, [r2], r4 ; load data step2[0] + vld1.s16 {q1}, [r2], r4 ; load data step2[1] + vld1.s16 {q10}, [r2], r4 ; load data step2[2] + vld1.s16 {q11}, [r2], r4 ; load data step2[3] + vld1.64 {d12}, [r7], r8 ; load destination data + vld1.64 {d13}, [r7], r8 ; load destination data + vadd.s16 q12, q0, q15 ; step2[0] + step2[15] + vadd.s16 q13, q1, q14 ; step2[1] + step2[14] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vsub.s16 q14, q1, q14 ; step2[1] - step2[14] + vsub.s16 q15, q0, q15 ; step2[0] - step2[15] + vld1.64 {d12}, [r7], r8 ; load destination data + vld1.64 {d13}, [r7], r8 ; load destination data + vadd.s16 q12, q10, q5 ; step2[2] + step2[13] + vadd.s16 q13, q11, q4 ; step2[3] + step2[12] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vsub.s16 q4, q11, q4 ; step2[3] - step2[12] + vsub.s16 q5, q10, q5 ; step2[2] - step2[13] + vld1.s16 {q0}, [r2], r4 ; load data step2[4] + vld1.s16 {q1}, [r2], r4 ; load data step2[5] + vld1.s16 {q10}, [r2], r4 ; load data step2[6] + vld1.s16 {q11}, [r2], r4 ; load data step2[7] + vld1.64 {d12}, [r7], r8 ; load destination data + vld1.64 {d13}, [r7], r8 ; load destination data + vadd.s16 q12, q0, q3 ; step2[4] + step2[11] + vadd.s16 q13, q1, q2 ; step2[5] + step2[10] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12,
q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vsub.s16 q2, q1, q2 ; step2[5] - step2[10] + vsub.s16 q3, q0, q3 ; step2[4] - step2[11] + vld1.64 {d12}, [r7], r8 ; load destination data + vld1.64 {d13}, [r7], r8 ; load destination data + vadd.s16 q12, q10, q9 ; step2[6] + step2[9] + vadd.s16 q13, q11, q8 ; step2[7] + step2[8] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destination data + vld1.64 {d13}, [r7], r8 ; load destination data + vsub.s16 q8, q11, q8 ; step2[7] - step2[8] + vsub.s16 q9, q10, q9 ; step2[6] - step2[9] + + ; store the data output 8,9,10,11,12,13,14,15 + vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q8, q8, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q8 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destination data + vrshr.s16 q9, q9, #6 + vaddw.u8 q9, q9, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q9 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d13}, [r7], r8 ; load destination data + vrshr.s16 q2, q2, #6 + vaddw.u8 q2, q2, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q2 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destination data + vrshr.s16 q3, q3, #6 + vaddw.u8 q3, q3, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q3 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d13}, [r7], r8 ; load destination data + vrshr.s16 q4, q4, #6 +
vaddw.u8 q4, q4, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q4 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destination data + vrshr.s16 q5, q5, #6 + vaddw.u8 q5, q5, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q5 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d13}, [r7], r8 ; load destination data + vrshr.s16 q14, q14, #6 + vaddw.u8 q14, q14, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q14 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; NOTE(review): 17th dest-row load, one row past the 16 stored; result unused + vrshr.s16 q15, q15, #6 + vaddw.u8 q15, q15, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q15 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + b end_idct16x16_pass2 + +skip_adding_dest + ; stage 7 + ; load the data in pass1 + mov r5, #24 + mov r3, #8 + + vld1.s16 {q0}, [r2], r4 ; load data step2[0] + vld1.s16 {q1}, [r2], r4 ; load data step2[1] + vadd.s16 q12, q0, q15 ; step2[0] + step2[15] + vadd.s16 q13, q1, q14 ; step2[1] + step2[14] + vld1.s16 {q10}, [r2], r4 ; load data step2[2] + vld1.s16 {q11}, [r2], r4 ; load data step2[3] + vst1.64 {d24}, [r1], r3 ; store output[0] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[1] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q5 ; step2[2] + step2[13] + vadd.s16 q13, q11, q4 ; step2[3] + step2[12] + vsub.s16 q14, q1, q14 ; step2[1] - step2[14] + vsub.s16 q15, q0, q15 ; step2[0] - step2[15] + vst1.64 {d24}, [r1], r3 ; store output[2] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[3] + vst1.64 {d27}, [r1], r5 + vsub.s16 q4, q11, q4 ; step2[3] - step2[12] + vsub.s16 q5, q10, q5 ; step2[2] - step2[13] + vld1.s16 {q0}, [r2], r4 ; load data step2[4] + vld1.s16 {q1}, [r2], r4 ; load data step2[5] + vadd.s16 q12, q0, q3 ; step2[4] + step2[11] + vadd.s16 q13, q1, q2 ; step2[5] + step2[10] + vld1.s16 {q10}, [r2], r4 ; load data step2[6] + vld1.s16 {q11}, [r2], r4 ; load data step2[7] + vst1.64
{d24}, [r1], r3 ; store output[4] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[5] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q9 ; step2[6] + step2[9] + vadd.s16 q13, q11, q8 ; step2[7] + step2[8] + vsub.s16 q2, q1, q2 ; step2[5] - step2[10] + vsub.s16 q3, q0, q3 ; step2[4] - step2[11] + vsub.s16 q8, q11, q8 ; step2[7] - step2[8] + vsub.s16 q9, q10, q9 ; step2[6] - step2[9] + vst1.64 {d24}, [r1], r3 ; store output[6] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[7] + vst1.64 {d27}, [r1], r5 + + ; store the data output 8,9,10,11,12,13,14,15 + vst1.64 {d16}, [r1], r3 + vst1.64 {d17}, [r1], r5 + vst1.64 {d18}, [r1], r3 + vst1.64 {d19}, [r1], r5 + vst1.64 {d4}, [r1], r3 + vst1.64 {d5}, [r1], r5 + vst1.64 {d6}, [r1], r3 + vst1.64 {d7}, [r1], r5 + vst1.64 {d8}, [r1], r3 + vst1.64 {d9}, [r1], r5 + vst1.64 {d10}, [r1], r3 + vst1.64 {d11}, [r1], r5 + vst1.64 {d28}, [r1], r3 + vst1.64 {d29}, [r1], r5 + vst1.64 {d30}, [r1], r3 + vst1.64 {d31}, [r1], r5 +end_idct16x16_pass2 + pop {r3-r9} + bx lr + ENDP ; |aom_idct16x16_256_add_neon_pass2| + +;void |aom_idct16x16_10_add_neon_pass1|(int16_t *input, +; int16_t *output, int output_stride) +; +; r0 int16_t input +; r1 int16_t *output +; r2 int output_stride) + +; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|aom_idct16x16_10_add_neon_pass1| PROC + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q1,q2}, [r0]! 
+ vmov.s16 q15, q1 + + ; generate cospi_28_64*2 = 6392 + mov r3, #0x1800 + add r3, #0xf8 + + ; generate cospi_4_64*2 = 32138 + mov r12, #0x7d00 + add r12, #0x8a + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 q0, r3 ; duplicate cospi_28_64*2 + vdup.16 q1, r12 ; duplicate cospi_4_64*2 + + ; The following instructions use vqrdmulh to do the + ; dct_const_round_shift(step2[4] * cospi_28_64). vqrdmulh will multiply, + ; double, and return the high 16 bits, effectively giving >> 15. Doubling + ; the constant will change this to >> 14. + ; dct_const_round_shift(step2[4] * cospi_28_64); + vqrdmulh.s16 q4, q9, q0 + + ; preloading to avoid stall + ; generate cospi_16_64*2 = 23170 + mov r3, #0x5a00 + add r3, #0x82 + + ; dct_const_round_shift(step2[4] * cospi_4_64); + vqrdmulh.s16 q7, q9, q1 + + ; stage 4 + vdup.16 q1, r3 ; cospi_16_64*2 + + ; generate cospi_16_64 = 11585 + mov r3, #0x2d00 + add r3, #0x41 + + vdup.16 d4, r3; ; duplicate cospi_16_64 + + ; dct_const_round_shift(step1[0] * cospi_16_64) + vqrdmulh.s16 q8, q8, q1 + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d14, d4 + vmull.s16 q10, d15, d4 + + ; step2[5] * cospi_16_64 + vmull.s16 q12, d9, d4 + vmull.s16 q11, d8, d4 + + ; temp1 = (step2[6] - step2[5]) * cospi_16_64 + vsub.s32 q15, q10, q12 + vsub.s32 q6, q9, q11 + + ; temp2 = (step2[5] + step2[6]) * cospi_16_64 + vadd.s32 q9, q9, q11 + vadd.s32 q10, q10, q12 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d11, q15, #14 ; >> 14 + vqrshrn.s32 d10, q6, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q10, #14 ; >> 14 + + ; stage 6 + vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7]; + vadd.s16 q10, q8, q5 ; step2[2] = step1[2] + step1[5]; + vadd.s16 q11, q8, q4 ; step2[3] = step1[3] + step1[4]; + vadd.s16 q9, q8, q6 ; step2[1] = step1[1] + step1[6]; + vsub.s16 q12, q8, q4 ; step2[4] = step1[3] - step1[4]; + vsub.s16 q13, q8, q5 ; step2[5] = step1[2] - step1[5]; + vsub.s16 q14, q8, q6 ;
step2[6] = step1[1] - step1[6]; + vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7]; + + ; store the data + vst1.64 {d4}, [r1], r2 + vst1.64 {d5}, [r1], r2 + vst1.64 {d18}, [r1], r2 + vst1.64 {d19}, [r1], r2 + vst1.64 {d20}, [r1], r2 + vst1.64 {d21}, [r1], r2 + vst1.64 {d22}, [r1], r2 + vst1.64 {d23}, [r1], r2 + vst1.64 {d24}, [r1], r2 + vst1.64 {d25}, [r1], r2 + vst1.64 {d26}, [r1], r2 + vst1.64 {d27}, [r1], r2 + vst1.64 {d28}, [r1], r2 + vst1.64 {d29}, [r1], r2 + vst1.64 {d30}, [r1], r2 + vst1.64 {d31}, [r1], r2 + + bx lr + ENDP ; |aom_idct16x16_10_add_neon_pass1| + +;void aom_idct16x16_10_add_neon_pass2(int16_t *src, +; int16_t *output, +; int16_t *pass1Output, +; int16_t skip_adding, +; uint8_t *dest, +; int dest_stride) +; +; r0 int16_t *src +; r1 int16_t *output, +; r2 int16_t *pass1Output, +; r3 int16_t skip_adding, +; r4 uint8_t *dest, +; r5 int dest_stride) + +; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|aom_idct16x16_10_add_neon_pass2| PROC + push {r3-r9} + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q0,q1}, [r0]! 
+ vmov.s16 q15, q0; + + ; generate 2*cospi_30_64 = 3212 + mov r3, #0xc00 + add r3, #0x8c + + ; generate 2*cospi_2_64 = 32610 + mov r12, #0x7f00 + add r12, #0x62 + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 q6, r3 ; duplicate 2*cospi_30_64 + + ; dct_const_round_shift(step1[8] * cospi_30_64) + vqrdmulh.s16 q0, q8, q6 + + vdup.16 q6, r12 ; duplicate 2*cospi_2_64 + + ; dct_const_round_shift(step1[8] * cospi_2_64) + vqrdmulh.s16 q7, q8, q6 + + ; preloading to avoid stall + ; generate 2*cospi_26_64 = 9512 + mov r12, #0x2500 + add r12, #0x28 + rsb r12, #0 + vdup.16 q15, r12 ; duplicate -2*cospi_26_64 + + ; generate 2*cospi_6_64 = 31358 + mov r3, #0x7a00 + add r3, #0x7e + vdup.16 q14, r3 ; duplicate 2*cospi_6_64 + + ; dct_const_round_shift(- step1[12] * cospi_26_64) + vqrdmulh.s16 q3, q9, q15 + + ; dct_const_round_shift(step1[12] * cospi_6_64) + vqrdmulh.s16 q4, q9, q14 + + ; stage 4 + ; generate cospi_24_64 = 6270 + mov r3, #0x1800 + add r3, #0x7e + vdup.16 d31, r3 ; duplicate cospi_24_64 + + ; generate cospi_8_64 = 15137 + mov r12, #0x3b00 + add r12, #0x21 + vdup.16 d30, r12 ; duplicate cospi_8_64 + + ; step1[14] * cospi_24_64 + vmull.s16 q12, d14, d31 + vmull.s16 q5, d15, d31 + + ; step1[9] * cospi_24_64 + vmull.s16 q2, d0, d31 + vmull.s16 q11, d1, d31 + + ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 + vmlsl.s16 q12, d0, d30 + vmlsl.s16 q5, d1, d30 + + ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 + vmlal.s16 q2, d14, d30 + vmlal.s16 q11, d15, d30 + + rsb r12, #0 + vdup.16 d30, r12 ; duplicate -cospi_8_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d2, q12, #14 ; >> 14 + vqrshrn.s32 d3, q5, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q2, #14 ; >> 14 + vqrshrn.s32 d13, q11, #14 ; >> 14 + + ; - step1[13] * cospi_8_64 + vmull.s16 q10, d8, d30 + vmull.s16 q13, d9, d30 + + ; -step1[10] * cospi_8_64 + vmull.s16 q8, d6, d30 + vmull.s16 q9, d7, d30 + + ; temp1 = -step1[10] * cospi_24_64 - step1[13] * 
cospi_8_64 + vmlsl.s16 q10, d6, d31 + vmlsl.s16 q13, d7, d31 + + ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 + vmlal.s16 q8, d8, d31 + vmlal.s16 q9, d9, d31 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q10, #14 ; >> 14 + vqrshrn.s32 d5, q13, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d10, q8, #14 ; >> 14 + vqrshrn.s32 d11, q9, #14 ; >> 14 + + ; stage 5 + vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; + vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; + vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; + vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; + vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; + vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; + vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; + vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; + + ; stage 6. + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + vdup.16 d14, r12 ; duplicate cospi_16_64 + + ; step1[13] * cospi_16_64 + vmull.s16 q3, d26, d14 + vmull.s16 q4, d27, d14 + + ; step1[10] * cospi_16_64 + vmull.s16 q0, d20, d14 + vmull.s16 q1, d21, d14 + + ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 + vsub.s32 q5, q3, q0 + vsub.s32 q6, q4, q1 + + ; temp2 = (step1[10] + step1[13]) * cospi_16_64 + vadd.s32 q0, q3, q0 + vadd.s32 q1, q4, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q5, #14 ; >> 14 + vqrshrn.s32 d5, q6, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d10, q0, #14 ; >> 14 + vqrshrn.s32 d11, q1, #14 ; >> 14 + + ; step1[11] * cospi_16_64 + vmull.s16 q0, d22, d14 + vmull.s16 q1, d23, d14 + + ; step1[12] * cospi_16_64 + vmull.s16 q13, d24, d14 + vmull.s16 q6, d25, d14 + + ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 + vsub.s32 q10, q13, q0 + vsub.s32 q4, q6, q1 + + ; temp2 = (step1[11] + step1[12]) * cospi_16_64 + vadd.s32 q13, q13, q0 + vadd.s32 q6, q6, q1 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d6, q10, #14 ; >> 14 + vqrshrn.s32 
d7, q4, #14 ; >> 14 + + ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64); + vqrshrn.s32 d8, q13, #14 ; >> 14 + vqrshrn.s32 d9, q6, #14 ; >> 14 + + mov r4, #16 ; pass1Output stride + ldr r3, [sp] ; load skip_adding (unused: r3 is overwritten below) + + ; stage 7 + ; load the data in pass1 + mov r5, #24 + mov r3, #8 + + vld1.s16 {q0}, [r2], r4 ; load data step2[0] + vld1.s16 {q1}, [r2], r4 ; load data step2[1] + vadd.s16 q12, q0, q15 ; step2[0] + step2[15] + vadd.s16 q13, q1, q14 ; step2[1] + step2[14] + vld1.s16 {q10}, [r2], r4 ; load data step2[2] + vld1.s16 {q11}, [r2], r4 ; load data step2[3] + vst1.64 {d24}, [r1], r3 ; store output[0] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[1] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q5 ; step2[2] + step2[13] + vadd.s16 q13, q11, q4 ; step2[3] + step2[12] + vsub.s16 q14, q1, q14 ; step2[1] - step2[14] + vsub.s16 q15, q0, q15 ; step2[0] - step2[15] + vst1.64 {d24}, [r1], r3 ; store output[2] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[3] + vst1.64 {d27}, [r1], r5 + vsub.s16 q4, q11, q4 ; step2[3] - step2[12] + vsub.s16 q5, q10, q5 ; step2[2] - step2[13] + vld1.s16 {q0}, [r2], r4 ; load data step2[4] + vld1.s16 {q1}, [r2], r4 ; load data step2[5] + vadd.s16 q12, q0, q3 ; step2[4] + step2[11] + vadd.s16 q13, q1, q2 ; step2[5] + step2[10] + vld1.s16 {q10}, [r2], r4 ; load data step2[6] + vld1.s16 {q11}, [r2], r4 ; load data step2[7] + vst1.64 {d24}, [r1], r3 ; store output[4] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[5] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q9 ; step2[6] + step2[9] + vadd.s16 q13, q11, q8 ; step2[7] + step2[8] + vsub.s16 q2, q1, q2 ; step2[5] - step2[10] + vsub.s16 q3, q0, q3 ; step2[4] - step2[11] + vsub.s16 q8, q11, q8 ; step2[7] - step2[8] + vsub.s16 q9, q10, q9 ; step2[6] - step2[9] + vst1.64 {d24}, [r1], r3 ; store output[6] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[7] + vst1.64 {d27}, [r1], r5 + + ; store the data
output 8,9,10,11,12,13,14,15 + vst1.64 {d16}, [r1], r3 + vst1.64 {d17}, [r1], r5 + vst1.64 {d18}, [r1], r3 + vst1.64 {d19}, [r1], r5 + vst1.64 {d4}, [r1], r3 + vst1.64 {d5}, [r1], r5 + vst1.64 {d6}, [r1], r3 + vst1.64 {d7}, [r1], r5 + vst1.64 {d8}, [r1], r3 + vst1.64 {d9}, [r1], r5 + vst1.64 {d10}, [r1], r3 + vst1.64 {d11}, [r1], r5 + vst1.64 {d28}, [r1], r3 + vst1.64 {d29}, [r1], r5 + vst1.64 {d30}, [r1], r3 + vst1.64 {d31}, [r1], r5 +end_idct10_16x16_pass2 + pop {r3-r9} + bx lr + ENDP ; |aom_idct16x16_10_add_neon_pass2| + END diff --git a/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c new file mode 100644 index 000000000..b4cb7a0cd --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c @@ -0,0 +1,1295 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> + +#include "./aom_config.h" +#include "aom_dsp/txfm_common.h" + +static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16, + int16x8_t *q10s16, int16x8_t *q11s16, + int16x8_t *q12s16, int16x8_t *q13s16, + int16x8_t *q14s16, int16x8_t *q15s16) { + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + *q12s16 = vcombine_s16(d17s16, d25s16); + *q13s16 = vcombine_s16(d19s16, d27s16); + *q14s16 = vcombine_s16(d21s16, d29s16); + *q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16)); + q1x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16)); + q2x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16)); + q3x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 =
vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + *q8s16 = q0x2s16.val[0]; + *q9s16 = q0x2s16.val[1]; + *q10s16 = q1x2s16.val[0]; + *q11s16 = q1x2s16.val[1]; + *q12s16 = q2x2s16.val[0]; + *q13s16 = q2x2s16.val[1]; + *q14s16 = q3x2s16.val[0]; + *q15s16 = q3x2s16.val[1]; + return; +} + +void aom_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, + int output_stride) { + int16x4_t d0s16, d1s16, d2s16, d3s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(in); + q8s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q9s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q10s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q11s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q12s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q13s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q14s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + d16s16 = 
vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + // stage 3 + d0s16 = vdup_n_s16((int16_t)cospi_28_64); + d1s16 = vdup_n_s16((int16_t)cospi_4_64); + + q2s32 = vmull_s16(d18s16, d0s16); + q3s32 = vmull_s16(d19s16, d0s16); + q5s32 = vmull_s16(d18s16, d1s16); + q6s32 = vmull_s16(d19s16, d1s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); + q5s32 = vmlal_s16(q5s32, d30s16, d0s16); + q6s32 = vmlal_s16(q6s32, d31s16, d0s16); + + d2s16 = vdup_n_s16((int16_t)cospi_12_64); + d3s16 = vdup_n_s16((int16_t)cospi_20_64); + + d8s16 = vqrshrn_n_s32(q2s32, 14); + d9s16 = vqrshrn_n_s32(q3s32, 14); + d14s16 = vqrshrn_n_s32(q5s32, 14); + d15s16 = vqrshrn_n_s32(q6s32, 14); + q4s16 = vcombine_s16(d8s16, d9s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + q2s32 = vmull_s16(d26s16, d2s16); + q3s32 = vmull_s16(d27s16, d2s16); + q9s32 = vmull_s16(d26s16, d3s16); + q15s32 = vmull_s16(d27s16, d3s16); + + q2s32 = vmlsl_s16(q2s32, d22s16, d3s16); + q3s32 = vmlsl_s16(q3s32, d23s16, d3s16); + q9s32 = vmlal_s16(q9s32, d22s16, d2s16); + q15s32 = vmlal_s16(q15s32, d23s16, d2s16); + + d10s16 = vqrshrn_n_s32(q2s32, 14); + d11s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q15s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 4 + d30s16 = vdup_n_s16((int16_t)cospi_16_64); + + q2s32 = vmull_s16(d16s16, d30s16); + q11s32 = vmull_s16(d17s16, d30s16); + q0s32 = vmull_s16(d24s16, d30s16); + q1s32 = 
vmull_s16(d25s16, d30s16); + + d30s16 = vdup_n_s16((int16_t)cospi_24_64); + d31s16 = vdup_n_s16((int16_t)cospi_8_64); + + q3s32 = vaddq_s32(q2s32, q0s32); + q12s32 = vaddq_s32(q11s32, q1s32); + q13s32 = vsubq_s32(q2s32, q0s32); + q1s32 = vsubq_s32(q11s32, q1s32); + + d16s16 = vqrshrn_n_s32(q3s32, 14); + d17s16 = vqrshrn_n_s32(q12s32, 14); + d18s16 = vqrshrn_n_s32(q13s32, 14); + d19s16 = vqrshrn_n_s32(q1s32, 14); + q8s16 = vcombine_s16(d16s16, d17s16); + q9s16 = vcombine_s16(d18s16, d19s16); + + q0s32 = vmull_s16(d20s16, d31s16); + q1s32 = vmull_s16(d21s16, d31s16); + q12s32 = vmull_s16(d20s16, d30s16); + q13s32 = vmull_s16(d21s16, d30s16); + + q0s32 = vmlal_s16(q0s32, d28s16, d30s16); + q1s32 = vmlal_s16(q1s32, d29s16, d30s16); + q12s32 = vmlsl_s16(q12s32, d28s16, d31s16); + q13s32 = vmlsl_s16(q13s32, d29s16, d31s16); + + d22s16 = vqrshrn_n_s32(q0s32, 14); + d23s16 = vqrshrn_n_s32(q1s32, 14); + d20s16 = vqrshrn_n_s32(q12s32, 14); + d21s16 = vqrshrn_n_s32(q13s32, 14); + q10s16 = vcombine_s16(d20s16, d21s16); + q11s16 = vcombine_s16(d22s16, d23s16); + + q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q15s16 = vaddq_s16(q6s16, q7s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + // stage 5 + q0s16 = vaddq_s16(q8s16, q11s16); + q1s16 = vaddq_s16(q9s16, q10s16); + q2s16 = vsubq_s16(q9s16, q10s16); + q3s16 = vsubq_s16(q8s16, q11s16); + + d16s16 = vdup_n_s16((int16_t)cospi_16_64); + + q11s32 = vmull_s16(d26s16, d16s16); + q12s32 = vmull_s16(d27s16, d16s16); + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + + q6s32 = vsubq_s32(q9s32, q11s32); + q13s32 = vsubq_s32(q10s32, q12s32); + q9s32 = vaddq_s32(q9s32, q11s32); + q10s32 = vaddq_s32(q10s32, q12s32); + + d10s16 = vqrshrn_n_s32(q6s32, 14); + d11s16 = vqrshrn_n_s32(q13s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q10s32, 
14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 6 + q8s16 = vaddq_s16(q0s16, q15s16); + q9s16 = vaddq_s16(q1s16, q6s16); + q10s16 = vaddq_s16(q2s16, q5s16); + q11s16 = vaddq_s16(q3s16, q4s16); + q12s16 = vsubq_s16(q3s16, q4s16); + q13s16 = vsubq_s16(q2s16, q5s16); + q14s16 = vsubq_s16(q1s16, q6s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); + d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); + d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); + d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); + d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + // store the data + output_stride >>= 1; // output_stride / 2, out is int16_t + vst1_u64((uint64_t *)out, d16u64); + out += output_stride; + vst1_u64((uint64_t *)out, d17u64); + out += output_stride; + vst1_u64((uint64_t *)out, d18u64); + out += output_stride; + vst1_u64((uint64_t *)out, d19u64); + out += output_stride; + vst1_u64((uint64_t *)out, d20u64); + out += output_stride; + vst1_u64((uint64_t *)out, d21u64); + out += output_stride; + vst1_u64((uint64_t *)out, d22u64); + out += output_stride; + vst1_u64((uint64_t *)out, d23u64); + out += output_stride; + vst1_u64((uint64_t *)out, d24u64); + out += output_stride; + vst1_u64((uint64_t *)out, d25u64); + out += output_stride; + vst1_u64((uint64_t *)out, 
d26u64); + out += output_stride; + vst1_u64((uint64_t *)out, d27u64); + out += output_stride; + vst1_u64((uint64_t *)out, d28u64); + out += output_stride; + vst1_u64((uint64_t *)out, d29u64); + out += output_stride; + vst1_u64((uint64_t *)out, d30u64); + out += output_stride; + vst1_u64((uint64_t *)out, d31u64); + return; +} + +void aom_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, + int16_t *pass1Output, int16_t skip_adding, + uint8_t *dest, int dest_stride) { + uint8_t *d; + uint8x8_t d12u8, d13u8; + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + uint64x1_t d24u64, d25u64, d26u64, d27u64; + int64x1_t d12s64, d13s64; + uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16; + uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(src); + q8s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q9s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q10s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q11s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q12s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q13s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q14s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = 
vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + // stage 3 + d12s16 = vdup_n_s16((int16_t)cospi_30_64); + d13s16 = vdup_n_s16((int16_t)cospi_2_64); + + q2s32 = vmull_s16(d16s16, d12s16); + q3s32 = vmull_s16(d17s16, d12s16); + q1s32 = vmull_s16(d16s16, d13s16); + q4s32 = vmull_s16(d17s16, d13s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d13s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d13s16); + q1s32 = vmlal_s16(q1s32, d30s16, d12s16); + q4s32 = vmlal_s16(q4s32, d31s16, d12s16); + + d0s16 = vqrshrn_n_s32(q2s32, 14); + d1s16 = vqrshrn_n_s32(q3s32, 14); + d14s16 = vqrshrn_n_s32(q1s32, 14); + d15s16 = vqrshrn_n_s32(q4s32, 14); + q0s16 = vcombine_s16(d0s16, d1s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + d30s16 = vdup_n_s16((int16_t)cospi_14_64); + d31s16 = vdup_n_s16((int16_t)cospi_18_64); + + q2s32 = vmull_s16(d24s16, d30s16); + q3s32 = vmull_s16(d25s16, d30s16); + q4s32 = vmull_s16(d24s16, d31s16); + q5s32 = vmull_s16(d25s16, d31s16); + + q2s32 = vmlsl_s16(q2s32, d22s16, d31s16); + q3s32 = vmlsl_s16(q3s32, d23s16, d31s16); + q4s32 = vmlal_s16(q4s32, d22s16, d30s16); + q5s32 = vmlal_s16(q5s32, d23s16, d30s16); + + d2s16 = vqrshrn_n_s32(q2s32, 14); + d3s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q4s32, 14); + d13s16 = vqrshrn_n_s32(q5s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + d30s16 = vdup_n_s16((int16_t)cospi_22_64); + d31s16 = vdup_n_s16((int16_t)cospi_10_64); + + q11s32 = vmull_s16(d20s16, d30s16); + q12s32 = vmull_s16(d21s16, d30s16); + q4s32 = vmull_s16(d20s16, d31s16); + q5s32 = vmull_s16(d21s16, d31s16); + + q11s32 = 
vmlsl_s16(q11s32, d26s16, d31s16); + q12s32 = vmlsl_s16(q12s32, d27s16, d31s16); + q4s32 = vmlal_s16(q4s32, d26s16, d30s16); + q5s32 = vmlal_s16(q5s32, d27s16, d30s16); + + d4s16 = vqrshrn_n_s32(q11s32, 14); + d5s16 = vqrshrn_n_s32(q12s32, 14); + d11s16 = vqrshrn_n_s32(q5s32, 14); + d10s16 = vqrshrn_n_s32(q4s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + d30s16 = vdup_n_s16((int16_t)cospi_6_64); + d31s16 = vdup_n_s16((int16_t)cospi_26_64); + + q10s32 = vmull_s16(d28s16, d30s16); + q11s32 = vmull_s16(d29s16, d30s16); + q12s32 = vmull_s16(d28s16, d31s16); + q13s32 = vmull_s16(d29s16, d31s16); + + q10s32 = vmlsl_s16(q10s32, d18s16, d31s16); + q11s32 = vmlsl_s16(q11s32, d19s16, d31s16); + q12s32 = vmlal_s16(q12s32, d18s16, d30s16); + q13s32 = vmlal_s16(q13s32, d19s16, d30s16); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q11s32, 14); + d8s16 = vqrshrn_n_s32(q12s32, 14); + d9s16 = vqrshrn_n_s32(q13s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 3 + q9s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q10s16 = vsubq_s16(q3s16, q2s16); + q11s16 = vaddq_s16(q2s16, q3s16); + q12s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q6s16, q7s16); + + // stage 4 + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + d30s16 = vdup_n_s16((int16_t)cospi_8_64); + d31s16 = vdup_n_s16((int16_t)cospi_24_64); + + q2s32 = vmull_s16(d18s16, d31s16); + q3s32 = vmull_s16(d19s16, d31s16); + q4s32 = vmull_s16(d28s16, d31s16); + q5s32 = vmull_s16(d29s16, d31s16); + + q2s32 = vmlal_s16(q2s32, d28s16, d30s16); + q3s32 = vmlal_s16(q3s32, d29s16, d30s16); + q4s32 = vmlsl_s16(q4s32, 
d18s16, d30s16); + q5s32 = vmlsl_s16(q5s32, d19s16, d30s16); + + d12s16 = vqrshrn_n_s32(q2s32, 14); + d13s16 = vqrshrn_n_s32(q3s32, 14); + d2s16 = vqrshrn_n_s32(q4s32, 14); + d3s16 = vqrshrn_n_s32(q5s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + q3s16 = q11s16; + q4s16 = q12s16; + + d30s16 = vdup_n_s16(-cospi_8_64); + q11s32 = vmull_s16(d26s16, d30s16); + q12s32 = vmull_s16(d27s16, d30s16); + q8s32 = vmull_s16(d20s16, d30s16); + q9s32 = vmull_s16(d21s16, d30s16); + + q11s32 = vmlsl_s16(q11s32, d20s16, d31s16); + q12s32 = vmlsl_s16(q12s32, d21s16, d31s16); + q8s32 = vmlal_s16(q8s32, d26s16, d31s16); + q9s32 = vmlal_s16(q9s32, d27s16, d31s16); + + d4s16 = vqrshrn_n_s32(q11s32, 14); + d5s16 = vqrshrn_n_s32(q12s32, 14); + d10s16 = vqrshrn_n_s32(q8s32, 14); + d11s16 = vqrshrn_n_s32(q9s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + // stage 5 + q8s16 = vaddq_s16(q0s16, q3s16); + q9s16 = vaddq_s16(q1s16, q2s16); + q10s16 = vsubq_s16(q1s16, q2s16); + q11s16 = vsubq_s16(q0s16, q3s16); + q12s16 = vsubq_s16(q7s16, q4s16); + q13s16 = vsubq_s16(q6s16, q5s16); + q14s16 = vaddq_s16(q6s16, q5s16); + q15s16 = vaddq_s16(q7s16, q4s16); + + // stage 6 + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + + d14s16 = vdup_n_s16((int16_t)cospi_16_64); + + q3s32 = vmull_s16(d26s16, d14s16); + q4s32 = vmull_s16(d27s16, d14s16); + q0s32 = vmull_s16(d20s16, d14s16); + q1s32 = vmull_s16(d21s16, d14s16); + + q5s32 = vsubq_s32(q3s32, q0s32); + q6s32 = vsubq_s32(q4s32, q1s32); + q10s32 = vaddq_s32(q3s32, q0s32); + q4s32 = vaddq_s32(q4s32, q1s32); + + d4s16 = vqrshrn_n_s32(q5s32, 14); + d5s16 = vqrshrn_n_s32(q6s32, 14); + d10s16 = vqrshrn_n_s32(q10s32, 14); + d11s16 = vqrshrn_n_s32(q4s32, 14); + 
q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q0s32 = vmull_s16(d22s16, d14s16); + q1s32 = vmull_s16(d23s16, d14s16); + q13s32 = vmull_s16(d24s16, d14s16); + q6s32 = vmull_s16(d25s16, d14s16); + + q10s32 = vsubq_s32(q13s32, q0s32); + q4s32 = vsubq_s32(q6s32, q1s32); + q13s32 = vaddq_s32(q13s32, q0s32); + q6s32 = vaddq_s32(q6s32, q1s32); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q4s32, 14); + d8s16 = vqrshrn_n_s32(q13s32, 14); + d9s16 = vqrshrn_n_s32(q6s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 7 + if (skip_adding != 0) { + d = dest; + // load the data in pass1 + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + + q12s16 = vaddq_s16(q0s16, q15s16); + q13s16 = vaddq_s16(q1s16, q14s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = + vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); + q13u16 = + vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q14s16 = vsubq_s16(q1s16, q14s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q10s16, q5s16); + q13s16 = vaddq_s16(q11s16, q4s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = + vaddw_u8(vreinterpretq_u16_s16(q12s16), 
vreinterpret_u8_s64(d12s64)); + q13u16 = + vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q4s16 = vsubq_s16(q11s16, q4s16); + q5s16 = vsubq_s16(q10s16, q5s16); + + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q0s16, q3s16); + q13s16 = vaddq_s16(q1s16, q2s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = + vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); + q13u16 = + vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q2s16 = vsubq_s16(q1s16, q2s16); + q3s16 = vsubq_s16(q0s16, q3s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q10s16, q9s16); + q13s16 = vaddq_s16(q11s16, q8s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = + vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); + q13u16 = + vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, 
vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q8s16 = vsubq_s16(q11s16, q8s16); + q9s16 = vsubq_s16(q10s16, q9s16); + + // store the data out 8,9,10,11,12,13,14,15 + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q8s16 = vrshrq_n_s16(q8s16, 6); + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q9s16 = vrshrq_n_s16(q9s16, 6); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q2s16 = vrshrq_n_s16(q2s16, 6); + q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q3s16 = vrshrq_n_s16(q3s16, 6); + q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q4s16 = vrshrq_n_s16(q4s16, 6); + q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q5s16 = vrshrq_n_s16(q5s16, 6); + q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16)); + 
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q14s16 = vrshrq_n_s16(q14s16, 6); + q14u16 = + vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + q15s16 = vrshrq_n_s16(q15s16, 6); + q15u16 = + vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + } else { // skip_adding_dest + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q15s16); + q13s16 = vaddq_s16(q1s16, q14s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q14s16 = vsubq_s16(q1s16, q14s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q10s16, q5s16); + q13s16 = vaddq_s16(q11s16, q4s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q4s16 = vsubq_s16(q11s16, 
q4s16); + q5s16 = vsubq_s16(q10s16, q5s16); + + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q3s16); + q13s16 = vaddq_s16(q1s16, q2s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q2s16 = vsubq_s16(q1s16, q2s16); + q3s16 = vsubq_s16(q0s16, q3s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q10s16, q9s16); + q13s16 = vaddq_s16(q11s16, q8s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q8s16 = vsubq_s16(q11s16, q8s16); + q9s16 = vsubq_s16(q10s16, q9s16); + + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16))); + out += 4; + 
vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16))); + } + return; +} + +/* First pass of the reduced ("10") 16x16 inverse DCT for one group of 8 input rows. Each vld2q_s16 de-interleaves a 16-element row and only val[0] (the even-indexed coefficients) is kept. After TRANSPOSE8X8 (defined elsewhere in this file) only q8s16 and q9s16 are read; the other six transposed vectors are ignored, presumably because the caller guarantees that only low-frequency coefficients are non-zero (TODO confirm against the caller contract). The eight result vectors q8+q7, q8+q6, q8+q5, q8+q4, q8-q4, q8-q5, q8-q6, q8-q7 are written as sixteen 4-lane halves, advancing out by output_stride / 2 int16_t elements after each store. in is const-qualified so that this definition is compatible with the prototype in idct16x16_neon.c; the function only reads through it. */ void aom_idct16x16_10_add_neon_pass1(const int16_t *in, int16_t *out, + int output_stride) { + int16x4_t d4s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q6s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q15s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(in); + q8s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q9s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q10s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q11s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q12s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q13s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q14s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + // stage 3 + q0s16 = vdupq_n_s16((int16_t)(cospi_28_64
* 2)); + q1s16 = vdupq_n_s16((int16_t)(cospi_4_64 * 2)); + + /* The constants are pre-doubled because vqrdmulhq_s16 is a rounding, doubling multiply that returns the high half: with a doubled constant the result has the same rounded 14-bit scaling as the vqrshrn_n_s32(x, 14) narrowing used below. */ q4s16 = vqrdmulhq_s16(q9s16, q0s16); + q7s16 = vqrdmulhq_s16(q9s16, q1s16); + + // stage 4 + q1s16 = vdupq_n_s16((int16_t)(cospi_16_64 * 2)); + d4s16 = vdup_n_s16((int16_t)cospi_16_64); + + q8s16 = vqrdmulhq_s16(q8s16, q1s16); + + d8s16 = vget_low_s16(q4s16); + d9s16 = vget_high_s16(q4s16); + d14s16 = vget_low_s16(q7s16); + d15s16 = vget_high_s16(q7s16); + q9s32 = vmull_s16(d14s16, d4s16); + q10s32 = vmull_s16(d15s16, d4s16); + q12s32 = vmull_s16(d9s16, d4s16); + q11s32 = vmull_s16(d8s16, d4s16); + + q15s32 = vsubq_s32(q10s32, q12s32); + q6s32 = vsubq_s32(q9s32, q11s32); + q9s32 = vaddq_s32(q9s32, q11s32); + q10s32 = vaddq_s32(q10s32, q12s32); + + d11s16 = vqrshrn_n_s32(q15s32, 14); + d10s16 = vqrshrn_n_s32(q6s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q10s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 6 + q2s16 = vaddq_s16(q8s16, q7s16); + q9s16 = vaddq_s16(q8s16, q6s16); + q10s16 = vaddq_s16(q8s16, q5s16); + q11s16 = vaddq_s16(q8s16, q4s16); + q12s16 = vsubq_s16(q8s16, q4s16); + q13s16 = vsubq_s16(q8s16, q5s16); + q14s16 = vsubq_s16(q8s16, q6s16); + q15s16 = vsubq_s16(q8s16, q7s16); + + d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); + d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); + d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); + d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); + d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 =
vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + // store the data + output_stride >>= 1; // output_stride / 2, out is int16_t + vst1_u64((uint64_t *)out, d4u64); + out += output_stride; + vst1_u64((uint64_t *)out, d5u64); + out += output_stride; + vst1_u64((uint64_t *)out, d18u64); + out += output_stride; + vst1_u64((uint64_t *)out, d19u64); + out += output_stride; + vst1_u64((uint64_t *)out, d20u64); + out += output_stride; + vst1_u64((uint64_t *)out, d21u64); + out += output_stride; + vst1_u64((uint64_t *)out, d22u64); + out += output_stride; + vst1_u64((uint64_t *)out, d23u64); + out += output_stride; + vst1_u64((uint64_t *)out, d24u64); + out += output_stride; + vst1_u64((uint64_t *)out, d25u64); + out += output_stride; + vst1_u64((uint64_t *)out, d26u64); + out += output_stride; + vst1_u64((uint64_t *)out, d27u64); + out += output_stride; + vst1_u64((uint64_t *)out, d28u64); + out += output_stride; + vst1_u64((uint64_t *)out, d29u64); + out += output_stride; + vst1_u64((uint64_t *)out, d30u64); + out += output_stride; + vst1_u64((uint64_t *)out, d31u64); + return; +} + +void aom_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out, + int16_t *pass1Output, int16_t skip_adding, + uint8_t *dest, int dest_stride) { + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16; + uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64; + uint64x1_t d16u64, d17u64, d18u64, d19u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32,
q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32; + int16x8x2_t q0x2s16; + (void)skip_adding; + (void)dest; + (void)dest_stride; + + q0x2s16 = vld2q_s16(src); + q8s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q9s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q10s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q11s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q12s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q13s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q14s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + // stage 3 + q6s16 = vdupq_n_s16((int16_t)(cospi_30_64 * 2)); + q0s16 = vqrdmulhq_s16(q8s16, q6s16); + q6s16 = vdupq_n_s16((int16_t)(cospi_2_64 * 2)); + q7s16 = vqrdmulhq_s16(q8s16, q6s16); + + q15s16 = vdupq_n_s16(-cospi_26_64 * 2); + q14s16 = vdupq_n_s16((int16_t)(cospi_6_64 * 2)); + q3s16 = vqrdmulhq_s16(q9s16, q15s16); + q4s16 = vqrdmulhq_s16(q9s16, q14s16); + + // stage 4 + d0s16 = vget_low_s16(q0s16); + d1s16 = vget_high_s16(q0s16); + d6s16 = vget_low_s16(q3s16); + d7s16 = vget_high_s16(q3s16); + d8s16 = vget_low_s16(q4s16); + d9s16 = vget_high_s16(q4s16); + d14s16 = vget_low_s16(q7s16); + d15s16 = vget_high_s16(q7s16); + + d30s16 = vdup_n_s16((int16_t)cospi_8_64); + d31s16 = vdup_n_s16((int16_t)cospi_24_64); + + q12s32 = vmull_s16(d14s16, d31s16); + q5s32 = vmull_s16(d15s16, d31s16); + q2s32 = vmull_s16(d0s16, d31s16); + q11s32 = vmull_s16(d1s16, d31s16); + + q12s32 = vmlsl_s16(q12s32, d0s16, d30s16); + q5s32 = vmlsl_s16(q5s32, d1s16, d30s16); + q2s32 = vmlal_s16(q2s32, d14s16, d30s16); + q11s32 = vmlal_s16(q11s32, d15s16, d30s16); + + d2s16 = vqrshrn_n_s32(q12s32, 14); + d3s16 = vqrshrn_n_s32(q5s32, 14); + d12s16 = vqrshrn_n_s32(q2s32, 14); + d13s16 = vqrshrn_n_s32(q11s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + 
q6s16 = vcombine_s16(d12s16, d13s16); + + d30s16 = vdup_n_s16(-cospi_8_64); + q10s32 = vmull_s16(d8s16, d30s16); + q13s32 = vmull_s16(d9s16, d30s16); + q8s32 = vmull_s16(d6s16, d30s16); + q9s32 = vmull_s16(d7s16, d30s16); + + q10s32 = vmlsl_s16(q10s32, d6s16, d31s16); + q13s32 = vmlsl_s16(q13s32, d7s16, d31s16); + q8s32 = vmlal_s16(q8s32, d8s16, d31s16); + q9s32 = vmlal_s16(q9s32, d9s16, d31s16); + + d4s16 = vqrshrn_n_s32(q10s32, 14); + d5s16 = vqrshrn_n_s32(q13s32, 14); + d10s16 = vqrshrn_n_s32(q8s32, 14); + d11s16 = vqrshrn_n_s32(q9s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + // stage 5 + q8s16 = vaddq_s16(q0s16, q3s16); + q9s16 = vaddq_s16(q1s16, q2s16); + q10s16 = vsubq_s16(q1s16, q2s16); + q11s16 = vsubq_s16(q0s16, q3s16); + q12s16 = vsubq_s16(q7s16, q4s16); + q13s16 = vsubq_s16(q6s16, q5s16); + q14s16 = vaddq_s16(q6s16, q5s16); + q15s16 = vaddq_s16(q7s16, q4s16); + + // stage 6 + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + + d14s16 = vdup_n_s16((int16_t)cospi_16_64); + q3s32 = vmull_s16(d26s16, d14s16); + q4s32 = vmull_s16(d27s16, d14s16); + q0s32 = vmull_s16(d20s16, d14s16); + q1s32 = vmull_s16(d21s16, d14s16); + + q5s32 = vsubq_s32(q3s32, q0s32); + q6s32 = vsubq_s32(q4s32, q1s32); + q0s32 = vaddq_s32(q3s32, q0s32); + q4s32 = vaddq_s32(q4s32, q1s32); + + d4s16 = vqrshrn_n_s32(q5s32, 14); + d5s16 = vqrshrn_n_s32(q6s32, 14); + d10s16 = vqrshrn_n_s32(q0s32, 14); + d11s16 = vqrshrn_n_s32(q4s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q0s32 = vmull_s16(d22s16, d14s16); + q1s32 = vmull_s16(d23s16, d14s16); + q13s32 = vmull_s16(d24s16, d14s16); + q6s32 = vmull_s16(d25s16, d14s16); + + q10s32 = vsubq_s32(q13s32, q0s32); + q4s32 = vsubq_s32(q6s32, q1s32); 
+ q13s32 = vaddq_s32(q13s32, q0s32); + q6s32 = vaddq_s32(q6s32, q1s32); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q4s32, 14); + d8s16 = vqrshrn_n_s32(q13s32, 14); + d9s16 = vqrshrn_n_s32(q6s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 7 + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q15s16); + q13s16 = vaddq_s16(q1s16, q14s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q14s16 = vsubq_s16(q1s16, q14s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q10s16, q5s16); + q13s16 = vaddq_s16(q11s16, q4s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q4s16 = vsubq_s16(q11s16, q4s16); + q5s16 = vsubq_s16(q10s16, q5s16); + + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q3s16); + q13s16 = vaddq_s16(q1s16, q2s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = 
vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q2s16 = vsubq_s16(q1s16, q2s16); + q3s16 = vsubq_s16(q0s16, q3s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + q12s16 = vaddq_s16(q10s16, q9s16); + q13s16 = vaddq_s16(q11s16, q8s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q8s16 = vsubq_s16(q11s16, q8s16); + q9s16 = vsubq_s16(q10s16, q9s16); + + d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); + d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); + d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16)); + d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16)); + d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16)); + d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16)); + d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16)); + d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16)); + d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); + d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + vst1_u64((uint64_t *)out, d16u64); + out += 4; + vst1_u64((uint64_t *)out, d17u64); + out += 12; 
+ vst1_u64((uint64_t *)out, d18u64); + out += 4; + vst1_u64((uint64_t *)out, d19u64); + out += 12; + vst1_u64((uint64_t *)out, d4u64); + out += 4; + vst1_u64((uint64_t *)out, d5u64); + out += 12; + vst1_u64((uint64_t *)out, d6u64); + out += 4; + vst1_u64((uint64_t *)out, d7u64); + out += 12; + vst1_u64((uint64_t *)out, d8u64); + out += 4; + vst1_u64((uint64_t *)out, d9u64); + out += 12; + vst1_u64((uint64_t *)out, d10u64); + out += 4; + vst1_u64((uint64_t *)out, d11u64); + out += 12; + vst1_u64((uint64_t *)out, d28u64); + out += 4; + vst1_u64((uint64_t *)out, d29u64); + out += 12; + vst1_u64((uint64_t *)out, d30u64); + out += 4; + vst1_u64((uint64_t *)out, d31u64); + return; +} diff --git a/third_party/aom/aom_dsp/arm/idct16x16_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_neon.c new file mode 100644 index 000000000..db0d4905b --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct16x16_neon.c @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/aom_dsp_common.h" + +void aom_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output, + int output_stride); +void aom_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output, + int16_t *pass1Output, int16_t skip_adding, + uint8_t *dest, int dest_stride); +void aom_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output, + int output_stride); +void aom_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output, + int16_t *pass1Output, int16_t skip_adding, + uint8_t *dest, int dest_stride); + +#if HAVE_NEON_ASM +/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ +extern void aom_push_neon(int64_t *store); +extern void aom_pop_neon(int64_t *store); +#endif // HAVE_NEON_ASM + +void aom_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, + int dest_stride) { +#if HAVE_NEON_ASM + int64_t store_reg[8]; +#endif + int16_t pass1_output[16 * 16] = { 0 }; + int16_t row_idct_output[16 * 16] = { 0 }; + +#if HAVE_NEON_ASM + // save d8-d15 register values. + aom_push_neon(store_reg); +#endif + + /* Parallel idct on the upper 8 rows */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + aom_idct16x16_256_add_neon_pass1(input, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7 + // which will be saved into row_idct_output. + aom_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, + dest, dest_stride); + + /* Parallel idct on the lower 8 rows */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. 
+ aom_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7 + // which will be saved into row_idct_output. + aom_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8, + pass1_output, 0, dest, dest_stride); + + /* Parallel idct on the left 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + aom_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + aom_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, + pass1_output, 1, dest, dest_stride); + + /* Parallel idct on the right 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + aom_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + aom_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, + row_idct_output + 8, pass1_output, 1, + dest + 8, dest_stride); + +#if HAVE_NEON_ASM + // restore d8-d15 register values. + aom_pop_neon(store_reg); +#endif + + return; +} + +void aom_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest, + int dest_stride) { +#if HAVE_NEON_ASM + int64_t store_reg[8]; +#endif + int16_t pass1_output[16 * 16] = { 0 }; + int16_t row_idct_output[16 * 16] = { 0 }; + +#if HAVE_NEON_ASM + // save d8-d15 register values. 
+ aom_push_neon(store_reg); +#endif + + /* Parallel idct on the upper 8 rows */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + aom_idct16x16_10_add_neon_pass1(input, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7 + // which will be saved into row_idct_output. + aom_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, + dest, dest_stride); + + /* Skip Parallel idct on the lower 8 rows as they are all 0s */ + + /* Parallel idct on the left 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + aom_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + aom_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, + pass1_output, 1, dest, dest_stride); + + /* Parallel idct on the right 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + aom_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + aom_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, + row_idct_output + 8, pass1_output, 1, + dest + 8, dest_stride); + +#if HAVE_NEON_ASM + // restore d8-d15 register values. 
+ aom_pop_neon(store_reg); +#endif + + return; +} diff --git a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.asm b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.asm new file mode 100644 index 000000000..b04df2d0b --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.asm @@ -0,0 +1,147 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + + + EXPORT |aom_idct32x32_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ;TODO(hkuang): put the following macros in a separate + ;file so other idct functions could also use them.
+ MACRO + LD_16x8 $src, $stride + vld1.8 {q8}, [$src], $stride + vld1.8 {q9}, [$src], $stride + vld1.8 {q10}, [$src], $stride + vld1.8 {q11}, [$src], $stride + vld1.8 {q12}, [$src], $stride + vld1.8 {q13}, [$src], $stride + vld1.8 {q14}, [$src], $stride + vld1.8 {q15}, [$src], $stride + MEND + + MACRO + ADD_DIFF_16x8 $diff + vqadd.u8 q8, q8, $diff + vqadd.u8 q9, q9, $diff + vqadd.u8 q10, q10, $diff + vqadd.u8 q11, q11, $diff + vqadd.u8 q12, q12, $diff + vqadd.u8 q13, q13, $diff + vqadd.u8 q14, q14, $diff + vqadd.u8 q15, q15, $diff + MEND + + MACRO + SUB_DIFF_16x8 $diff + vqsub.u8 q8, q8, $diff + vqsub.u8 q9, q9, $diff + vqsub.u8 q10, q10, $diff + vqsub.u8 q11, q11, $diff + vqsub.u8 q12, q12, $diff + vqsub.u8 q13, q13, $diff + vqsub.u8 q14, q14, $diff + vqsub.u8 q15, q15, $diff + MEND + + MACRO + ST_16x8 $dst, $stride + vst1.8 {q8}, [$dst], $stride + vst1.8 {q9}, [$dst], $stride + vst1.8 {q10},[$dst], $stride + vst1.8 {q11},[$dst], $stride + vst1.8 {q12},[$dst], $stride + vst1.8 {q13},[$dst], $stride + vst1.8 {q14},[$dst], $stride + vst1.8 {q15},[$dst], $stride + MEND + +;void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride + +|aom_idct32x32_1_add_neon| PROC + push {lr} + pld [r1] + add r3, r1, #16 ; r3 dest + 16 for second loop + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 6) + add r0, r0, #32 ; + (1 <<((6) - 1)) + asrs r0, r0, #6 ; >> 6 + bge diff_positive_32_32 + +diff_negative_32_32 + neg r0, r0 + 
usat r0, #8, r0 + vdup.u8 q0, r0 + mov r0, #4 + +diff_negative_32_32_loop + sub r0, #1 + LD_16x8 r1, r2 + SUB_DIFF_16x8 q0 + ST_16x8 r12, r2 + + LD_16x8 r1, r2 + SUB_DIFF_16x8 q0 + ST_16x8 r12, r2 + cmp r0, #2 + moveq r1, r3 + moveq r12, r3 + cmp r0, #0 + bne diff_negative_32_32_loop + pop {pc} + +diff_positive_32_32 + usat r0, #8, r0 + vdup.u8 q0, r0 + mov r0, #4 + +diff_positive_32_32_loop + sub r0, #1 + LD_16x8 r1, r2 + ADD_DIFF_16x8 q0 + ST_16x8 r12, r2 + + LD_16x8 r1, r2 + ADD_DIFF_16x8 q0 + ST_16x8 r12, r2 + cmp r0, #2 + moveq r1, r3 + moveq r12, r3 + cmp r0, #0 + bne diff_positive_32_32_loop + pop {pc} + + ENDP ; |aom_idct32x32_1_add_neon| + END diff --git a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c new file mode 100644 index 000000000..547567c5b --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> + +#include "./aom_config.h" + +#include "aom_dsp/inv_txfm.h" +#include "aom_ports/mem.h" + +static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, + uint8x16_t *q9u8, uint8x16_t *q10u8, + uint8x16_t *q11u8, uint8x16_t *q12u8, + uint8x16_t *q13u8, uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vld1q_u8(d); + d += d_stride; + *q9u8 = vld1q_u8(d); + d += d_stride; + *q10u8 = vld1q_u8(d); + d += d_stride; + *q11u8 = vld1q_u8(d); + d += d_stride; + *q12u8 = vld1q_u8(d); + d += d_stride; + *q13u8 = vld1q_u8(d); + d += d_stride; + *q14u8 = vld1q_u8(d); + d += d_stride; + *q15u8 = vld1q_u8(d); + return; +} + +static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, + uint8x16_t *q9u8, uint8x16_t *q10u8, + uint8x16_t *q11u8, uint8x16_t *q12u8, + uint8x16_t *q13u8, uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vqaddq_u8(*q8u8, qdiffu8); + *q9u8 = vqaddq_u8(*q9u8, qdiffu8); + *q10u8 = vqaddq_u8(*q10u8, qdiffu8); + *q11u8 = vqaddq_u8(*q11u8, qdiffu8); + *q12u8 = vqaddq_u8(*q12u8, qdiffu8); + *q13u8 = vqaddq_u8(*q13u8, qdiffu8); + *q14u8 = vqaddq_u8(*q14u8, qdiffu8); + *q15u8 = vqaddq_u8(*q15u8, qdiffu8); + return; +} + +static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, + uint8x16_t *q9u8, uint8x16_t *q10u8, + uint8x16_t *q11u8, uint8x16_t *q12u8, + uint8x16_t *q13u8, uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vqsubq_u8(*q8u8, qdiffu8); + *q9u8 = vqsubq_u8(*q9u8, qdiffu8); + *q10u8 = vqsubq_u8(*q10u8, qdiffu8); + *q11u8 = vqsubq_u8(*q11u8, qdiffu8); + *q12u8 = vqsubq_u8(*q12u8, qdiffu8); + *q13u8 = vqsubq_u8(*q13u8, qdiffu8); + *q14u8 = vqsubq_u8(*q14u8, qdiffu8); + *q15u8 = vqsubq_u8(*q15u8, qdiffu8); + return; +} + +static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, + uint8x16_t *q9u8, uint8x16_t *q10u8, + uint8x16_t *q11u8, uint8x16_t *q12u8, + uint8x16_t *q13u8, uint8x16_t *q14u8, + uint8x16_t *q15u8) { + vst1q_u8(d, *q8u8); + d += d_stride; + vst1q_u8(d, *q9u8); + d
+= d_stride; + vst1q_u8(d, *q10u8); + d += d_stride; + vst1q_u8(d, *q11u8); + d += d_stride; + vst1q_u8(d, *q12u8); + d += d_stride; + vst1q_u8(d, *q13u8); + d += d_stride; + vst1q_u8(d, *q14u8); + d += d_stride; + vst1q_u8(d, *q15u8); + return; +} + +void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int i, j, dest_stride8; + uint8_t *d; + int16_t a1; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 6); + + dest_stride8 = dest_stride * 8; + if (a1 >= 0) { // diff_positive_32_32 + a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; + q0u8 = vdupq_n_u8(a1); + for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop + d = dest; + for (j = 0; j < 4; j++) { + LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + d += dest_stride8; + } + } + } else { // diff_negative_32_32 + a1 = -a1; + a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; + q0u8 = vdupq_n_u8(a1); + for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop + d = dest; + for (j = 0; j < 4; j++) { + LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + d += dest_stride8; + } + } + } + return; +} diff --git a/third_party/aom/aom_dsp/arm/idct32x32_add_neon.asm b/third_party/aom/aom_dsp/arm/idct32x32_add_neon.asm new file mode 100644 index 000000000..e7793fb16 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct32x32_add_neon.asm @@ -0,0 +1,1302 @@ +; +; Copyright (c) 2016, Alliance for Open Media. 
All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +;TODO(cd): adjust these constants to be able to use vqdmulh for faster +; dct_const_round_shift(a * b) within butterfly calculations. +cospi_1_64 EQU 16364 +cospi_2_64 EQU 16305 +cospi_3_64 EQU 16207 +cospi_4_64 EQU 16069 +cospi_5_64 EQU 15893 +cospi_6_64 EQU 15679 +cospi_7_64 EQU 15426 +cospi_8_64 EQU 15137 +cospi_9_64 EQU 14811 +cospi_10_64 EQU 14449 +cospi_11_64 EQU 14053 +cospi_12_64 EQU 13623 +cospi_13_64 EQU 13160 +cospi_14_64 EQU 12665 +cospi_15_64 EQU 12140 +cospi_16_64 EQU 11585 +cospi_17_64 EQU 11003 +cospi_18_64 EQU 10394 +cospi_19_64 EQU 9760 +cospi_20_64 EQU 9102 +cospi_21_64 EQU 8423 +cospi_22_64 EQU 7723 +cospi_23_64 EQU 7005 +cospi_24_64 EQU 6270 +cospi_25_64 EQU 5520 +cospi_26_64 EQU 4756 +cospi_27_64 EQU 3981 +cospi_28_64 EQU 3196 +cospi_29_64 EQU 2404 +cospi_30_64 EQU 1606 +cospi_31_64 EQU 804 + + + EXPORT |aom_idct32x32_1024_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + AREA Block, CODE, READONLY + + ; -------------------------------------------------------------------------- + ; Load from transposed_buffer + ; q14 = transposed_buffer[first_offset] + ; q13 = transposed_buffer[second_offset] + ; for proper address calculation, the last offset used when manipulating + ; transposed_buffer must be passed in. use 0 for first use.
+ MACRO + LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset + ; address calculation with proper stride and loading + add r0, #($first_offset - $prev_offset )*8*2 + vld1.s16 {q14}, [r0] + add r0, #($second_offset - $first_offset)*8*2 + vld1.s16 {q13}, [r0] + ; (used) two registers (q14, q13) + MEND + ; -------------------------------------------------------------------------- + ; Load from output (used as temporary storage) + ; reg1 = output[first_offset] + ; reg2 = output[second_offset] + ; for proper address calculation, the last offset used when manipulating + ; output (whether reading or storing) must be passed in. use 0 for first + ; use. + MACRO + LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 + ; address calculation with proper stride and loading + add r1, #($first_offset - $prev_offset )*32*2 + vld1.s16 {$reg1}, [r1] + add r1, #($second_offset - $first_offset)*32*2 + vld1.s16 {$reg2}, [r1] + ; (used) two registers ($reg1, $reg2) + MEND + ; -------------------------------------------------------------------------- + ; Store into output (sometimes as temporary storage) + ; output[first_offset] = reg1 + ; output[second_offset] = reg2 + ; for proper address calculation, the last offset used when manipulating + ; output (whether reading or storing) must be passed in. use 0 for first + ; use.
+ MACRO + STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 + ; address calculation with proper stride and storing + add r1, #($first_offset - $prev_offset )*32*2 + vst1.16 {$reg1}, [r1] + add r1, #($second_offset - $first_offset)*32*2 + vst1.16 {$reg2}, [r1] + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q6-q9 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_CENTER_RESULTS + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d8}, [r10], r2 + vld1.s16 {d11}, [r9], r11 + vld1.s16 {d9}, [r10] + vld1.s16 {d10}, [r9] + ; ROUND_POWER_OF_TWO + vrshr.s16 q7, q7, #6 + vrshr.s16 q8, q8, #6 + vrshr.s16 q9, q9, #6 + vrshr.s16 q6, q6, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q7, q7, d9 + vaddw.u8 q8, q8, d10 + vaddw.u8 q9, q9, d11 + vaddw.u8 q6, q6, d8 + ; clip pixel + vqmovun.s16 d9, q7 + vqmovun.s16 d10, q8 + vqmovun.s16 d11, q9 + vqmovun.s16 d8, q6 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d9}, [r10], r11 + vst1.16 {d10}, [r9], r2 + vst1.16 {d8}, [r10] + vst1.16 {d11}, [r9] + ; update pointers (by dest_stride * 2) + sub r9, r9, r2, lsl #1 + add r10, r10, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q6-q9 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_CENTER_RESULTS_LAST + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d8}, [r10], r2 + vld1.s16 {d11}, [r9], r11 + vld1.s16 {d9}, [r10] + vld1.s16 {d10}, [r9] + ; ROUND_POWER_OF_TWO + vrshr.s16 q7, q7, #6 + vrshr.s16 q8, q8, #6 + vrshr.s16 q9, q9, #6 + vrshr.s16 q6, q6, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q7, q7, d9 + vaddw.u8 q8, q8, d10 + vaddw.u8 q9, q9, d11 + vaddw.u8 q6, q6, d8 + ; clip pixel + vqmovun.s16 d9, q7 + vqmovun.s16 d10, q8 + vqmovun.s16 d11, q9 + vqmovun.s16 d8, q6 + ; store back into dest[j * 
dest_stride + 0-31] + vst1.16 {d9}, [r10], r11 + vst1.16 {d10}, [r9], r2 + vst1.16 {d8}, [r10]! + vst1.16 {d11}, [r9]! + ; update pointers (by dest_stride * 2) + sub r9, r9, r2, lsl #1 + add r10, r10, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q4-q7 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_EXTREME_RESULTS + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d4}, [r7], r2 + vld1.s16 {d7}, [r6], r11 + vld1.s16 {d5}, [r7] + vld1.s16 {d6}, [r6] + ; ROUND_POWER_OF_TWO + vrshr.s16 q5, q5, #6 + vrshr.s16 q6, q6, #6 + vrshr.s16 q7, q7, #6 + vrshr.s16 q4, q4, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q5, q5, d5 + vaddw.u8 q6, q6, d6 + vaddw.u8 q7, q7, d7 + vaddw.u8 q4, q4, d4 + ; clip pixel + vqmovun.s16 d5, q5 + vqmovun.s16 d6, q6 + vqmovun.s16 d7, q7 + vqmovun.s16 d4, q4 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d5}, [r7], r11 + vst1.16 {d6}, [r6], r2 + vst1.16 {d7}, [r6] + vst1.16 {d4}, [r7] + ; update pointers (by dest_stride * 2) + sub r6, r6, r2, lsl #1 + add r7, r7, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q4-q7 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_EXTREME_RESULTS_LAST + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d4}, [r7], r2 + vld1.s16 {d7}, [r6], r11 + vld1.s16 {d5}, [r7] + vld1.s16 {d6}, [r6] + ; ROUND_POWER_OF_TWO + vrshr.s16 q5, q5, #6 + vrshr.s16 q6, q6, #6 + vrshr.s16 q7, q7, #6 + vrshr.s16 q4, q4, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q5, q5, d5 + vaddw.u8 q6, q6, d6 + vaddw.u8 q7, q7, d7 + vaddw.u8 q4, q4, d4 + ; clip pixel + vqmovun.s16 d5, q5 + vqmovun.s16 d6, q6 + vqmovun.s16 d7, q7 + vqmovun.s16 d4, q4 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d5}, [r7], r11 + vst1.16 {d6}, [r6], r2 + vst1.16 {d7}, [r6]! 
+ vst1.16 {d4}, [r7]! + ; update pointers (by dest_stride * 2) + sub r6, r6, r2, lsl #1 + add r7, r7, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Touches q8-q12, q15 (q13-q14 are preserved) + ; valid output registers are anything but q8-q11 + MACRO + DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 + ; TODO(cd): have special case to re-use constants when they are similar for + ; consecutive butterflies + ; TODO(cd): have special case when both constants are the same, do the + ; additions/subtractions before the multiplies. + ; generate the constants + ; generate scalar constants + mov r8, #$first_constant & 0xFF00 + mov r12, #$second_constant & 0xFF00 + add r8, #$first_constant & 0x00FF + add r12, #$second_constant & 0x00FF + ; generate vector constants + vdup.16 d30, r8 + vdup.16 d31, r12 + ; (used) two for inputs (regA-regD), one for constants (q15) + ; do some multiplications (ordered for maximum latency hiding) + vmull.s16 q8, $regC, d30 + vmull.s16 q10, $regA, d31 + vmull.s16 q9, $regD, d30 + vmull.s16 q11, $regB, d31 + vmull.s16 q12, $regC, d31 + ; (used) five for intermediate (q8-q12), one for constants (q15) + ; do some addition/subtractions (to get back two register) + vsub.s32 q8, q8, q10 + vsub.s32 q9, q9, q11 + ; do more multiplications (ordered for maximum latency hiding) + vmull.s16 q10, $regD, d31 + vmull.s16 q11, $regA, d30 + vmull.s16 q15, $regB, d30 + ; (used) six for intermediate (q8-q12, q15) + ; do more addition/subtractions + vadd.s32 q11, q12, q11 + vadd.s32 q10, q10, q15 + ; (used) four for intermediate (q8-q11) + ; dct_const_round_shift + vqrshrn.s32 $reg1, q8, #14 + vqrshrn.s32 $reg2, q9, #14 + vqrshrn.s32 $reg3, q11, #14 + vqrshrn.s32 $reg4, q10, #14 + ; (used) two for results, well four d registers + MEND + ; -------------------------------------------------------------------------- + ; Touches q8-q12, q15 (q13-q14 are 
preserved) + ; valid output registers are anything but q8-q11 + MACRO + DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 + DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 + MEND + ; -------------------------------------------------------------------------- + +;void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +; +; r0 int16_t *input, +; r1 uint8_t *dest, +; r2 int dest_stride) +; loop counters +; r4 bands loop counter +; r5 pass loop counter +; r8 transpose loop counter +; combine-add pointers +; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...) +; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...) +; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...) +; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...) + +|aom_idct32x32_1024_add_neon| PROC + ; This function does one pass of idct32x32 transform. + ; + ; This is done by transposing the input and then doing a 1d transform on + ; columns. In the first pass, the transposed columns are the original + ; rows. In the second pass, after the transposition, the columns are the + ; original columns. + ; The 1d transform is done by looping over bands of eight columns (the + ; idct32_bands loop). For each band, the transform input transposition + ; is done on demand, one band of four 8x8 matrices at a time. The four + ; matrices are transposed by pairs (the idct32_transpose_pair loop).
+ push {r4-r11} + vpush {d8-d15} + ; stack operation + ; internal buffer used to transpose 8 lines into before transforming them + ; int16_t transpose_buffer[32 * 8]; + ; at sp + [4096, 4607] + ; results of the first pass (transpose and transform rows) + ; int16_t pass1[32 * 32]; + ; at sp + [0, 2047] + ; results of the second pass (transpose and transform columns) + ; int16_t pass2[32 * 32]; + ; at sp + [2048, 4095] + sub sp, sp, #512+2048+2048 + + ; r6 = dest + 31 * dest_stride + ; r7 = dest + 0 * dest_stride + ; r9 = dest + 15 * dest_stride + ; r10 = dest + 16 * dest_stride + rsb r6, r2, r2, lsl #5 + rsb r9, r2, r2, lsl #4 + add r10, r1, r2, lsl #4 + mov r7, r1 + add r6, r6, r1 + add r9, r9, r1 + ; r11 = -dest_stride + neg r11, r2 + ; r3 = input + mov r3, r0 + ; parameters for first pass + ; r0 = transpose_buffer[32 * 8] + add r0, sp, #4096 + ; r1 = pass1[32 * 32] + mov r1, sp + + mov r5, #0 ; initialize pass loop counter +idct32_pass_loop + mov r4, #4 ; initialize bands loop counter +idct32_bands_loop + mov r8, #2 ; initialize transpose loop counter +idct32_transpose_pair_loop + ; Load two horizontally consecutive 8x8 16bit data matrices. The first one + ; into q0-q7 and the second one into q8-q15. There is a stride of 64, + ; adjusted to 32 because of the two post-increments. + vld1.s16 {q8}, [r3]! + vld1.s16 {q0}, [r3]! + add r3, #32 + vld1.s16 {q9}, [r3]! + vld1.s16 {q1}, [r3]! + add r3, #32 + vld1.s16 {q10}, [r3]! + vld1.s16 {q2}, [r3]! + add r3, #32 + vld1.s16 {q11}, [r3]! + vld1.s16 {q3}, [r3]! + add r3, #32 + vld1.s16 {q12}, [r3]! + vld1.s16 {q4}, [r3]! + add r3, #32 + vld1.s16 {q13}, [r3]! + vld1.s16 {q5}, [r3]! + add r3, #32 + vld1.s16 {q14}, [r3]! + vld1.s16 {q6}, [r3]! + add r3, #32 + vld1.s16 {q15}, [r3]! + vld1.s16 {q7}, [r3]! + + ; Transpose the two 8x8 16bit data matrices. 
+ vswp d17, d24 + vswp d23, d30 + vswp d21, d28 + vswp d19, d26 + vswp d1, d8 + vswp d7, d14 + vswp d5, d12 + vswp d3, d10 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q13, q15 + vtrn.32 q0, q2 + vtrn.32 q1, q3 + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.16 q14, q15 + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + ; Store both matrices after each other. There is a stride of 32, which + ; adjusts to nothing because of the post-increments. + vst1.16 {q8}, [r0]! + vst1.16 {q9}, [r0]! + vst1.16 {q10}, [r0]! + vst1.16 {q11}, [r0]! + vst1.16 {q12}, [r0]! + vst1.16 {q13}, [r0]! + vst1.16 {q14}, [r0]! + vst1.16 {q15}, [r0]! + vst1.16 {q0}, [r0]! + vst1.16 {q1}, [r0]! + vst1.16 {q2}, [r0]! + vst1.16 {q3}, [r0]! + vst1.16 {q4}, [r0]! + vst1.16 {q5}, [r0]! + vst1.16 {q6}, [r0]! + vst1.16 {q7}, [r0]! + + ; increment pointers by adjusted stride (not necessary for r0/out) + ; go back by 7*32 for the seven lines moved fully by read and add + ; go back by 32 for the eighth line only read + ; advance by 16*2 to go to the next pair + sub r3, r3, #7*32*2 + 32 - 16*2 + ; transpose pair loop processing + subs r8, r8, #1 + bne idct32_transpose_pair_loop + + ; restore r0/input to its original value + sub r0, r0, #32*8*2 + + ; Instead of doing the transforms stage by stage, it is done by loading + ; some input values and doing as many stages as possible to minimize the + ; storing/loading of intermediate results. To fit within registers, the + ; final coefficients are cut into four blocks: + ; BLOCK A: 16-19,28-31 + ; BLOCK B: 20-23,24-27 + ; BLOCK C: 8-10,11-15 + ; BLOCK D: 0-3,4-7 + ; Blocks A and C are straight calculation through the various stages. In + ; block B, further calculations are performed using the results from + ; block A.
In block D, further calculations are performed using the results + ; from block C and then the final calculations are done using results from + ; block A and B which have been combined at the end of block B. + + ; -------------------------------------------------------------------------- + ; BLOCK A: 16-19,28-31 + ; -------------------------------------------------------------------------- + ; generate 16,17,30,31 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64; + ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64; + ;step1b[16][i] = dct_const_round_shift(temp1); + ;step1b[31][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 0, 1, 31 + DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64; + ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64; + ;step1b[17][i] = dct_const_round_shift(temp1); + ;step1b[30][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 31, 17, 15 + DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[16] = step1b[16][i] + step1b[17][i]; + ;step2[17] = step1b[16][i] - step1b[17][i]; + ;step2[30] = -step1b[30][i] + step1b[31][i]; + ;step2[31] = step1b[30][i] + step1b[31][i]; + vadd.s16 q4, q0, q1 + vsub.s16 q13, q0, q1 + vadd.s16 q6, q2, q3 + vsub.s16 q14, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64; + ;temp2 = step1b[30][i] * cospi_4_64 + step1b[17][i] * cospi_28_64; + ;step3[17] = dct_const_round_shift(temp1); + ;step3[30] = dct_const_round_shift(temp2); +
DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; generate 18,19,28,29 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64; + ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64; + ;step1b[18][i] = dct_const_round_shift(temp1); + ;step1b[29][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 15, 9, 23 + DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64; + ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64; + ;step1b[19][i] = dct_const_round_shift(temp1); + ;step1b[28][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 23, 25, 7 + DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[18] = -step1b[18][i] + step1b[19][i]; + ;step2[19] = step1b[18][i] + step1b[19][i]; + ;step2[28] = step1b[28][i] + step1b[29][i]; + ;step2[29] = step1b[28][i] - step1b[29][i]; + vsub.s16 q13, q3, q2 + vadd.s16 q3, q3, q2 + vsub.s16 q14, q1, q0 + vadd.s16 q2, q1, q0 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64); + ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64); + ;step3[29] = dct_const_round_shift(temp1); + ;step3[18] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1 + ; -------------------------------------------------------------------------- + ; combine 16-19,28-31 + ; -------------------------------------------------------------------------- + ; part of 
stage 4 + ;step1[16] = step1b[16][i] + step1b[19][i]; + ;step1[17] = step1b[17][i] + step1b[18][i]; + ;step1[18] = step1b[17][i] - step1b[18][i]; + ;step1[29] = step1b[30][i] - step1b[29][i]; + ;step1[30] = step1b[30][i] + step1b[29][i]; + ;step1[31] = step1b[31][i] + step1b[28][i]; + vadd.s16 q8, q4, q2 + vadd.s16 q9, q5, q0 + vadd.s16 q10, q7, q1 + vadd.s16 q15, q6, q3 + vsub.s16 q13, q5, q0 + vsub.s16 q14, q7, q1 + STORE_IN_OUTPUT 0, 16, 31, q8, q15 + STORE_IN_OUTPUT 31, 17, 30, q9, q10 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64; + ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64; + ;step2[18] = dct_const_round_shift(temp1); + ;step2[29] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3 + STORE_IN_OUTPUT 30, 29, 18, q1, q0 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[19] = step1b[16][i] - step1b[19][i]; + ;step1[28] = step1b[31][i] - step1b[28][i]; + vsub.s16 q13, q4, q2 + vsub.s16 q14, q6, q3 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64; + ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64; + ;step2[19] = dct_const_round_shift(temp1); + ;step2[28] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13 + STORE_IN_OUTPUT 18, 19, 28, q4, q6 + ; -------------------------------------------------------------------------- + + + ; -------------------------------------------------------------------------- + ; BLOCK B: 20-23,24-27 + ; -------------------------------------------------------------------------- + ; generate 20,21,26,27 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[5 * 32] * 
cospi_27_64 - input[27 * 32] * cospi_5_64; + ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64; + ;step1b[20][i] = dct_const_round_shift(temp1); + ;step1b[27][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 7, 5, 27 + DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64; + ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64; + ;step1b[21][i] = dct_const_round_shift(temp1); + ;step1b[26][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 27, 21, 11 + DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[20] = step1b[20][i] + step1b[21][i]; + ;step2[21] = step1b[20][i] - step1b[21][i]; + ;step2[26] = -step1b[26][i] + step1b[27][i]; + ;step2[27] = step1b[26][i] + step1b[27][i]; + vsub.s16 q13, q0, q1 + vadd.s16 q0, q0, q1 + vsub.s16 q14, q2, q3 + vadd.s16 q2, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64; + ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64; + ;step3[21] = dct_const_round_shift(temp1); + ;step3[26] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; generate 22,23,24,25 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64; + ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64; + ;step1b[22][i] = dct_const_round_shift(temp1); + ;step1b[25][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 11, 13, 19 + DO_BUTTERFLY_STD 
cospi_19_64, cospi_13_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64; + ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64; + ;step1b[23][i] = dct_const_round_shift(temp1); + ;step1b[24][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 19, 29, 3 + DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[22] = -step1b[22][i] + step1b[23][i]; + ;step2[23] = step1b[22][i] + step1b[23][i]; + ;step2[24] = step1b[24][i] + step1b[25][i]; + ;step2[25] = step1b[24][i] - step1b[25][i]; + vsub.s16 q14, q4, q5 + vadd.s16 q5, q4, q5 + vsub.s16 q13, q6, q7 + vadd.s16 q6, q6, q7 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64); + ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64); + ;step3[25] = dct_const_round_shift(temp1); + ;step3[22] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15 + ; -------------------------------------------------------------------------- + ; combine 20-23,24-27 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[22] = step1b[22][i] + step1b[21][i]; + ;step1[23] = step1b[23][i] + step1b[20][i]; + vadd.s16 q10, q7, q1 + vadd.s16 q11, q5, q0 + ;step1[24] = step1b[24][i] + step1b[27][i]; + ;step1[25] = step1b[25][i] + step1b[26][i]; + vadd.s16 q12, q6, q2 + vadd.s16 q15, q4, q3 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[16] = step1b[16][i] + step1b[23][i]; + ;step3[17] = step1b[17][i] + step1b[22][i]; + ;step3[22] = step1b[17][i] - step1b[22][i]; + ;step3[23] = 
step1b[16][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 28, 16, 17, q14, q13 + vadd.s16 q8, q14, q11 + vadd.s16 q9, q13, q10 + vsub.s16 q13, q13, q10 + vsub.s16 q11, q14, q11 + STORE_IN_OUTPUT 17, 17, 16, q9, q8 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[24] = step1b[31][i] - step1b[24][i]; + ;step3[25] = step1b[30][i] - step1b[25][i]; + ;step3[30] = step1b[30][i] + step1b[25][i]; + ;step3[31] = step1b[31][i] + step1b[24][i]; + LOAD_FROM_OUTPUT 16, 30, 31, q14, q9 + vsub.s16 q8, q9, q12 + vadd.s16 q10, q14, q15 + vsub.s16 q14, q14, q15 + vadd.s16 q12, q9, q12 + STORE_IN_OUTPUT 31, 30, 31, q10, q12 + ; -------------------------------------------------------------------------- + ; TODO(cd) do some register allocation change to remove these push/pop + vpush {q8} ; [24] + vpush {q11} ; [23] + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64; + ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64; + ;step1[22] = dct_const_round_shift(temp1); + ;step1[25] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 + STORE_IN_OUTPUT 31, 25, 22, q14, q13 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64; + ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64; + ;step1[23] = dct_const_round_shift(temp1); + ;step1[24] = dct_const_round_shift(temp2); + ; TODO(cd) do some register allocation change to remove these push/pop + vpop {q13} ; [23] + vpop {q14} ; [24] + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 + STORE_IN_OUTPUT 22, 24, 23, q14, q13 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[20] = step1b[23][i] - step1b[20][i]; + ;step1[27] = step1b[24][i] - step1b[27][i]; + vsub.s16 q14, q5, q0 
+ vsub.s16 q13, q6, q2 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64); + ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64); + ;step2[27] = dct_const_round_shift(temp1); + ;step2[20] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[21] = step1b[22][i] - step1b[21][i]; + ;step1[26] = step1b[25][i] - step1b[26][i]; + vsub.s16 q14, q7, q1 + vsub.s16 q13, q4, q3 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64); + ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64); + ;step2[26] = dct_const_round_shift(temp1); + ;step2[21] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[18] = step1b[18][i] + step1b[21][i]; + ;step3[19] = step1b[19][i] + step1b[20][i]; + ;step3[20] = step1b[19][i] - step1b[20][i]; + ;step3[21] = step1b[18][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 23, 18, 19, q14, q13 + vadd.s16 q8, q14, q1 + vadd.s16 q9, q13, q6 + vsub.s16 q13, q13, q6 + vsub.s16 q1, q14, q1 + STORE_IN_OUTPUT 19, 18, 19, q8, q9 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[27] = step1b[28][i] - step1b[27][i]; + ;step3[28] = step1b[28][i] + step1b[27][i]; + ;step3[29] = step1b[29][i] + step1b[26][i]; + ;step3[26] = step1b[29][i] - step1b[26][i]; + LOAD_FROM_OUTPUT 19, 28, 29, q8, q9 + vsub.s16 q14, q8, q5 + vadd.s16 q10, q8, q5 + vadd.s16 q11, q9, q0 + vsub.s16 q0, q9, q0 + STORE_IN_OUTPUT 29, 28, 29, q10, q11 + ; 
-------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64; + ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64; + ;step1[20] = dct_const_round_shift(temp1); + ;step1[27] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 + STORE_IN_OUTPUT 29, 20, 27, q13, q14 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64; + ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64; + ;step1[21] = dct_const_round_shift(temp1); + ;step1[26] = dct_const_round_shift(temp2); + DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1 + STORE_IN_OUTPUT 27, 21, 26, q1, q0 + ; -------------------------------------------------------------------------- + + + ; -------------------------------------------------------------------------- + ; BLOCK C: 8-10,11-15 + ; -------------------------------------------------------------------------- + ; generate 8,9,14,15 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64; + ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64; + ;step2[8] = dct_const_round_shift(temp1); + ;step2[15] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 3, 2, 30 + DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64; + ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64; + ;step2[9] = dct_const_round_shift(temp1); + ;step2[14] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 30, 18, 14 + DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7 + ; 
-------------------------------------------------------------------------- + ; part of stage 3 + ;step3[8] = step1b[8][i] + step1b[9][i]; + ;step3[9] = step1b[8][i] - step1b[9][i]; + ;step3[14] = step1b[15][i] - step1b[14][i]; + ;step3[15] = step1b[15][i] + step1b[14][i]; + vsub.s16 q13, q0, q1 + vadd.s16 q0, q0, q1 + vsub.s16 q14, q2, q3 + vadd.s16 q2, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64; + ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64; + ;step1[9] = dct_const_round_shift(temp1); + ;step1[14] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; generate 10,11,12,13 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64; + ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64; + ;step2[10] = dct_const_round_shift(temp1); + ;step2[13] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 14, 10, 22 + DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64; + ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64; + ;step2[11] = dct_const_round_shift(temp1); + ;step2[12] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 22, 26, 6 + DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;step3[10] = step1b[11][i] - step1b[10][i]; + ;step3[11] = step1b[11][i] + step1b[10][i]; + ;step3[12] = step1b[12][i] + step1b[13][i]; + ;step3[13] = step1b[12][i] - step1b[13][i]; + vsub.s16 
q14, q4, q5 + vadd.s16 q5, q4, q5 + vsub.s16 q13, q6, q7 + vadd.s16 q6, q6, q7 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64); + ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64); + ;step1[13] = dct_const_round_shift(temp1); + ;step1[10] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15 + ; -------------------------------------------------------------------------- + ; combine 8-10,11-15 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;step2[8] = step1b[8][i] + step1b[11][i]; + ;step2[9] = step1b[9][i] + step1b[10][i]; + ;step2[10] = step1b[9][i] - step1b[10][i]; + vadd.s16 q8, q0, q5 + vadd.s16 q9, q1, q7 + vsub.s16 q13, q1, q7 + ;step2[13] = step1b[14][i] - step1b[13][i]; + ;step2[14] = step1b[14][i] + step1b[13][i]; + ;step2[15] = step1b[15][i] + step1b[12][i]; + vsub.s16 q14, q3, q4 + vadd.s16 q10, q3, q4 + vadd.s16 q15, q2, q6 + STORE_IN_OUTPUT 26, 8, 15, q8, q15 + STORE_IN_OUTPUT 15, 9, 14, q9, q10 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64; + ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64; + ;step3[10] = dct_const_round_shift(temp1); + ;step3[13] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 + STORE_IN_OUTPUT 14, 13, 10, q3, q1 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;step2[11] = step1b[8][i] - step1b[11][i]; + ;step2[12] = step1b[15][i] - step1b[12][i]; + vsub.s16 q13, q0, q5 + vsub.s16 q14, q2, q6 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64; + ;temp2 = (step1b[12][i] + 
step1b[11][i]) * cospi_16_64; + ;step3[11] = dct_const_round_shift(temp1); + ;step3[12] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 + STORE_IN_OUTPUT 10, 11, 12, q1, q3 + ; -------------------------------------------------------------------------- + + + ; -------------------------------------------------------------------------- + ; BLOCK D: 0-3,4-7 + ; -------------------------------------------------------------------------- + ; generate 4,5,6,7 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64; + ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64; + ;step3[4] = dct_const_round_shift(temp1); + ;step3[7] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 6, 4, 28 + DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64; + ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64; + ;step3[5] = dct_const_round_shift(temp1); + ;step3[6] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 28, 20, 12 + DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[4] = step1b[4][i] + step1b[5][i]; + ;step1[5] = step1b[4][i] - step1b[5][i]; + ;step1[6] = step1b[7][i] - step1b[6][i]; + ;step1[7] = step1b[7][i] + step1b[6][i]; + vsub.s16 q13, q0, q1 + vadd.s16 q0, q0, q1 + vsub.s16 q14, q2, q3 + vadd.s16 q2, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64; + ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64; + ;step2[5] = dct_const_round_shift(temp1); + ;step2[6] = 
dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; generate 0,1,2,3 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64; + ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64; + ;step1[1] = dct_const_round_shift(temp1); + ;step1[0] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 12, 0, 16 + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64; + ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64; + ;step1[2] = dct_const_round_shift(temp1); + ;step1[3] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 16, 8, 24 + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;step2[0] = step1b[0][i] + step1b[3][i]; + ;step2[1] = step1b[1][i] + step1b[2][i]; + ;step2[2] = step1b[1][i] - step1b[2][i]; + ;step2[3] = step1b[0][i] - step1b[3][i]; + vadd.s16 q4, q7, q6 + vsub.s16 q7, q7, q6 + vsub.s16 q6, q5, q14 + vadd.s16 q5, q5, q14 + ; -------------------------------------------------------------------------- + ; combine 0-3,4-7 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[0] = step1b[0][i] + step1b[7][i]; + ;step3[1] = step1b[1][i] + step1b[6][i]; + ;step3[2] = step1b[2][i] + step1b[5][i]; + ;step3[3] = step1b[3][i] + step1b[4][i]; + vadd.s16 q8, q4, q2 + vadd.s16 q9, q5, q3 + vadd.s16 q10, q6, q1 + vadd.s16 q11, q7, q0 + ;step3[4] = step1b[3][i] - step1b[4][i]; + ;step3[5] = step1b[2][i] - step1b[5][i]; + ;step3[6] = step1b[1][i] - step1b[6][i]; + ;step3[7] = step1b[0][i] - 
step1b[7][i]; + vsub.s16 q12, q7, q0 + vsub.s16 q13, q6, q1 + vsub.s16 q14, q5, q3 + vsub.s16 q15, q4, q2 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[0] = step1b[0][i] + step1b[15][i]; + ;step1[1] = step1b[1][i] + step1b[14][i]; + ;step1[14] = step1b[1][i] - step1b[14][i]; + ;step1[15] = step1b[0][i] - step1b[15][i]; + LOAD_FROM_OUTPUT 12, 14, 15, q0, q1 + vadd.s16 q2, q8, q1 + vadd.s16 q3, q9, q0 + vsub.s16 q4, q9, q0 + vsub.s16 q5, q8, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[14 * 32] = step1b[14][i] + step1b[17][i]; + ;output[15 * 32] = step1b[15][i] + step1b[16][i]; + ;output[16 * 32] = step1b[15][i] - step1b[16][i]; + ;output[17 * 32] = step1b[14][i] - step1b[17][i]; + LOAD_FROM_OUTPUT 15, 16, 17, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + + cmp r5, #0 + bgt idct32_bands_end_2nd_pass + +idct32_bands_end_1st_pass + STORE_IN_OUTPUT 17, 16, 17, q6, q7 + STORE_IN_OUTPUT 17, 14, 15, q8, q9 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; + ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; + ;output[30 * 32] = step1b[1][i] - step1b[30][i]; + ;output[31 * 32] = step1b[0][i] - step1b[31][i]; + LOAD_FROM_OUTPUT 15, 30, 31, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 31, 30, 31, q6, q7 + STORE_IN_OUTPUT 31, 0, 1, q4, q5 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[2] = step1b[2][i] + step1b[13][i]; + ;step1[3] = step1b[3][i] + step1b[12][i]; + ;step1[12] = step1b[3][i] - step1b[12][i]; + ;step1[13] = step1b[2][i] - step1b[13][i]; + LOAD_FROM_OUTPUT 1, 12, 13, q0, q1 + vadd.s16 q2, q10, q1 + vadd.s16 q3, q11, q0 + vsub.s16 q4, q11, q0 + vsub.s16 q5, 
q10, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[12 * 32] = step1b[12][i] + step1b[19][i]; + ;output[13 * 32] = step1b[13][i] + step1b[18][i]; + ;output[18 * 32] = step1b[13][i] - step1b[18][i]; + ;output[19 * 32] = step1b[12][i] - step1b[19][i]; + LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 19, 18, 19, q6, q7 + STORE_IN_OUTPUT 19, 12, 13, q8, q9 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; + ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; + ;output[28 * 32] = step1b[3][i] - step1b[28][i]; + ;output[29 * 32] = step1b[2][i] - step1b[29][i]; + LOAD_FROM_OUTPUT 13, 28, 29, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 29, 28, 29, q6, q7 + STORE_IN_OUTPUT 29, 2, 3, q4, q5 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[4] = step1b[4][i] + step1b[11][i]; + ;step1[5] = step1b[5][i] + step1b[10][i]; + ;step1[10] = step1b[5][i] - step1b[10][i]; + ;step1[11] = step1b[4][i] - step1b[11][i]; + LOAD_FROM_OUTPUT 3, 10, 11, q0, q1 + vadd.s16 q2, q12, q1 + vadd.s16 q3, q13, q0 + vsub.s16 q4, q13, q0 + vsub.s16 q5, q12, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[10 * 32] = step1b[10][i] + step1b[21][i]; + ;output[11 * 32] = step1b[11][i] + step1b[20][i]; + ;output[20 * 32] = step1b[11][i] - step1b[20][i]; + ;output[21 * 32] = step1b[10][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 21, 20, 21, q6, q7 + STORE_IN_OUTPUT 21, 10, 11, q8, q9 + ; 
-------------------------------------------------------------------------- + ; part of final stage + ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; + ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; + ;output[26 * 32] = step1b[5][i] - step1b[26][i]; + ;output[27 * 32] = step1b[4][i] - step1b[27][i]; + LOAD_FROM_OUTPUT 11, 26, 27, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 27, 26, 27, q6, q7 + STORE_IN_OUTPUT 27, 4, 5, q4, q5 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[6] = step1b[6][i] + step1b[9][i]; + ;step1[7] = step1b[7][i] + step1b[8][i]; + ;step1[8] = step1b[7][i] - step1b[8][i]; + ;step1[9] = step1b[6][i] - step1b[9][i]; + LOAD_FROM_OUTPUT 5, 8, 9, q0, q1 + vadd.s16 q2, q14, q1 + vadd.s16 q3, q15, q0 + vsub.s16 q4, q15, q0 + vsub.s16 q5, q14, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; + ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; + ;output[22 * 32] = step1b[9][i] - step1b[22][i]; + ;output[23 * 32] = step1b[8][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 23, 22, 23, q6, q7 + STORE_IN_OUTPUT 23, 8, 9, q8, q9 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; + ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; + ;output[24 * 32] = step1b[7][i] - step1b[24][i]; + ;output[25 * 32] = step1b[6][i] - step1b[25][i]; + LOAD_FROM_OUTPUT 9, 24, 25, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 25, 24, 25, q6, q7 + STORE_IN_OUTPUT 25, 6, 7, q4, q5 + + ; restore r0 by removing the last offset from the last + ; operation 
(LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 + sub r0, r0, #24*8*2 + ; restore r1 by removing the last offset from the last + ; operation (STORE_IN_OUTPUT 24, 6, 7) => 7*32*2 + ; advance by 8 columns => 8*2 + sub r1, r1, #7*32*2 - 8*2 + ; advance by 8 lines (8*32*2) + ; go back by the two pairs from the loop (32*2) + add r3, r3, #8*32*2 - 32*2 + + ; bands loop processing + subs r4, r4, #1 + bne idct32_bands_loop + + ; parameters for second pass + ; the input of pass2 is the result of pass1. we have to remove the offset + ; of 32 columns induced by the above idct32_bands_loop + sub r3, r1, #32*2 + ; r1 = pass2[32 * 32] + add r1, sp, #2048 + + ; pass loop processing + add r5, r5, #1 + b idct32_pass_loop + +idct32_bands_end_2nd_pass + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; + ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; + ;output[30 * 32] = step1b[1][i] - step1b[30][i]; + ;output[31 * 32] = step1b[0][i] - step1b[31][i]; + LOAD_FROM_OUTPUT 17, 30, 31, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[2] = step1b[2][i] + step1b[13][i]; + ;step1[3] = step1b[3][i] + step1b[12][i]; + ;step1[12] = step1b[3][i] - step1b[12][i]; + ;step1[13] = step1b[2][i] - step1b[13][i]; + LOAD_FROM_OUTPUT 31, 12, 13, q0, q1 + vadd.s16 q2, q10, q1 + vadd.s16 q3, q11, q0 + vsub.s16 q4, q11, q0 + vsub.s16 q5, q10, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[12 * 32] = step1b[12][i] + step1b[19][i]; + ;output[13 * 32] = step1b[13][i] + step1b[18][i]; + ;output[18 * 32] = step1b[13][i] - step1b[18][i]; + ;output[19 * 32] = step1b[12][i] - step1b[19][i]; + LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 + 
vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; + ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; + ;output[28 * 32] = step1b[3][i] - step1b[28][i]; + ;output[29 * 32] = step1b[2][i] - step1b[29][i]; + LOAD_FROM_OUTPUT 19, 28, 29, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[4] = step1b[4][i] + step1b[11][i]; + ;step1[5] = step1b[5][i] + step1b[10][i]; + ;step1[10] = step1b[5][i] - step1b[10][i]; + ;step1[11] = step1b[4][i] - step1b[11][i]; + LOAD_FROM_OUTPUT 29, 10, 11, q0, q1 + vadd.s16 q2, q12, q1 + vadd.s16 q3, q13, q0 + vsub.s16 q4, q13, q0 + vsub.s16 q5, q12, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[10 * 32] = step1b[10][i] + step1b[21][i]; + ;output[11 * 32] = step1b[11][i] + step1b[20][i]; + ;output[20 * 32] = step1b[11][i] - step1b[20][i]; + ;output[21 * 32] = step1b[10][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; + ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; + ;output[26 * 32] = step1b[5][i] - step1b[26][i]; + ;output[27 * 32] = step1b[4][i] - step1b[27][i]; + LOAD_FROM_OUTPUT 21, 26, 27, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; 
-------------------------------------------------------------------------- + ; part of stage 7 + ;step1[6] = step1b[6][i] + step1b[9][i]; + ;step1[7] = step1b[7][i] + step1b[8][i]; + ;step1[8] = step1b[7][i] - step1b[8][i]; + ;step1[9] = step1b[6][i] - step1b[9][i]; + LOAD_FROM_OUTPUT 27, 8, 9, q0, q1 + vadd.s16 q2, q14, q1 + vadd.s16 q3, q15, q0 + vsub.s16 q4, q15, q0 + vsub.s16 q5, q14, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; + ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; + ;output[22 * 32] = step1b[9][i] - step1b[22][i]; + ;output[23 * 32] = step1b[8][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS_LAST + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; + ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; + ;output[24 * 32] = step1b[7][i] - step1b[24][i]; + ;output[25 * 32] = step1b[6][i] - step1b[25][i]; + LOAD_FROM_OUTPUT 23, 24, 25, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS_LAST + ; -------------------------------------------------------------------------- + ; restore pointers to their initial indices for next band pass by + ; removing/adding dest_stride * 8. The actual increment by eight + ; is taken care of within the _LAST macros. 
+ add r6, r6, r2, lsl #3 + add r9, r9, r2, lsl #3 + sub r7, r7, r2, lsl #3 + sub r10, r10, r2, lsl #3 + + ; restore r0 by removing the last offset from the last + ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 + sub r0, r0, #24*8*2 + ; restore r1 by removing the last offset from the last + ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2 + ; advance by 8 columns => 8*2 + sub r1, r1, #25*32*2 - 8*2 + ; advance by 8 lines (8*32*2) + ; go back by the two pairs from the loop (32*2) + add r3, r3, #8*32*2 - 32*2 + + ; bands loop processing + subs r4, r4, #1 + bne idct32_bands_loop + + ; stack operation + add sp, sp, #512+2048+2048 + vpop {d8-d15} + pop {r4-r11} + bx lr + ENDP ; |aom_idct32x32_1024_add_neon| + END diff --git a/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c b/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c new file mode 100644 index 000000000..a7562c7d5 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c @@ -0,0 +1,686 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_config.h"
+#include "aom_dsp/txfm_common.h"
+
+/*
+ * The helper macros below are not hygienic: they read and write locals of the
+ * function they are expanded in (trans_buf, out, stride, q13s16, q14s16, ...)
+ * and already end in ';', so call sites may omit the trailing semicolon.
+ * The `prev` argument is unused by the C expansion.
+ * NOTE(review): `prev` presumably mirrors the assembly macros of the same
+ * name, which need the previous offset to advance a pointer -- confirm.
+ */
+
+/* Load rows `first` and `second` (8 coefficients each) of the transposed
+ * band into q14s16 and q13s16, the implicit inputs of DO_BUTTERFLY_STD. */
+#define LOAD_FROM_TRANSPOSED(prev, first, second) \
+  q14s16 = vld1q_s16(trans_buf + first * 8);       \
+  q13s16 = vld1q_s16(trans_buf + second * 8);
+
+/* Load rows `first` and `second` of the current 8-column slice of `out`
+ * (row pitch is 32 int16_t) into qA and qB. */
+#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
+  qA = vld1q_s16(out + first * 32);                    \
+  qB = vld1q_s16(out + second * 32);
+
+/* Store qA and qB to rows `first` and `second` of the current 8-column slice
+ * of `out` (row pitch is 32 int16_t). */
+#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
+  vst1q_s16(out + first * 32, qA);                    \
+  vst1q_s16(out + second * 32, qB);
+
+/* Expands to a call of __STORE_COMBINE_CENTER_RESULTS using the caller's
+ * `stride`, q6s16, q7s16, q8s16 and q9s16. */
+#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
+  __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16);
+
+/*
+ * Round four rows of residuals (>> 6 with rounding), add them to the
+ * corresponding 8-pixel destination rows, clamp to [0, 255] and write the
+ * pixels back.
+ *
+ * Row mapping:
+ *   q6s16 -> p1            q7s16 -> p1 + stride
+ *   q8s16 -> p2 - stride   q9s16 -> p2
+ *
+ * p1 and p2 are passed by value, so the caller's pointers are not advanced.
+ *
+ * The destination is byte data, so it is accessed with vld1_u8/vst1_u8.
+ * Going through an int16_t * cast (as this code originally did) tells the
+ * compiler the address is 2-byte aligned and aliases int16_t, neither of
+ * which holds for an arbitrary uint8_t * plus stride.
+ */
+static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2,
+                                                  int stride, int16x8_t q6s16,
+                                                  int16x8_t q7s16,
+                                                  int16x8_t q8s16,
+                                                  int16x8_t q9s16) {
+  uint8x8_t d8u8, d9u8, d10u8, d11u8;
+
+  /* Fetch the four destination rows (8 pixels each). */
+  d8u8 = vld1_u8(p1);
+  p1 += stride;
+  d11u8 = vld1_u8(p2);
+  p2 -= stride;
+  d9u8 = vld1_u8(p1);
+  d10u8 = vld1_u8(p2);
+
+  /* ROUND_POWER_OF_TWO(residual, 6) */
+  q7s16 = vrshrq_n_s16(q7s16, 6);
+  q8s16 = vrshrq_n_s16(q8s16, 6);
+  q9s16 = vrshrq_n_s16(q9s16, 6);
+  q6s16 = vrshrq_n_s16(q6s16, 6);
+
+  /* residual + pixel: the pixel is zero-extended to 16 bits by vaddw_u8. */
+  q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), d9u8));
+  q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16), d10u8));
+  q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16), d11u8));
+  q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), d8u8));
+
+  /* clip_pixel: saturating narrow from signed 16 bit to unsigned 8 bit. */
+  d9u8 = vqmovun_s16(q7s16);
+  d10u8 = vqmovun_s16(q8s16);
+  d11u8 = vqmovun_s16(q9s16);
+  d8u8 = vqmovun_s16(q6s16);
+
+  /* p1 currently points at the second row of its pair and p2 at the first
+   * row of its pair, hence the store order. */
+  vst1_u8(p1, d9u8);
+  p1 -= stride;
+  vst1_u8(p2, d10u8);
+  p2 += stride;
+  vst1_u8(p1, d8u8);
+  vst1_u8(p2, d11u8);
+}
+
+#define
STORE_COMBINE_EXTREME_RESULTS(r7, r6) \ + ; \ + __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16); +static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2, + int stride, int16x8_t q4s16, + int16x8_t q5s16, + int16x8_t q6s16, + int16x8_t q7s16) { + int16x4_t d4s16, d5s16, d6s16, d7s16; + + d4s16 = vld1_s16((int16_t *)p1); + p1 += stride; + d7s16 = vld1_s16((int16_t *)p2); + p2 -= stride; + d5s16 = vld1_s16((int16_t *)p1); + d6s16 = vld1_s16((int16_t *)p2); + + q5s16 = vrshrq_n_s16(q5s16, 6); + q6s16 = vrshrq_n_s16(q6s16, 6); + q7s16 = vrshrq_n_s16(q7s16, 6); + q4s16 = vrshrq_n_s16(q4s16, 6); + + q5s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s16(d5s16))); + q6s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d6s16))); + q7s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d7s16))); + q4s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s16(d4s16))); + + d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16)); + d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); + d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); + d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16)); + + vst1_s16((int16_t *)p1, d5s16); + p1 -= stride; + vst1_s16((int16_t *)p2, d6s16); + p2 += stride; + vst1_s16((int16_t *)p2, d7s16); + vst1_s16((int16_t *)p1, d4s16); + return; +} + +#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \ + DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB); +static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16, + int16_t first_const, int16_t second_const, + int16x8_t *qAs16, int16x8_t *qBs16) { + int16x4_t d30s16, d31s16; + int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32; + int16x4_t dCs16, dDs16, dAs16, dBs16; + + dCs16 = vget_low_s16(q14s16); + dDs16 = vget_high_s16(q14s16); + dAs16 = vget_low_s16(q13s16); + dBs16 = vget_high_s16(q13s16); + + d30s16 = 
vdup_n_s16(first_const); + d31s16 = vdup_n_s16(second_const); + + q8s32 = vmull_s16(dCs16, d30s16); + q10s32 = vmull_s16(dAs16, d31s16); + q9s32 = vmull_s16(dDs16, d30s16); + q11s32 = vmull_s16(dBs16, d31s16); + q12s32 = vmull_s16(dCs16, d31s16); + + q8s32 = vsubq_s32(q8s32, q10s32); + q9s32 = vsubq_s32(q9s32, q11s32); + + q10s32 = vmull_s16(dDs16, d31s16); + q11s32 = vmull_s16(dAs16, d30s16); + q15s32 = vmull_s16(dBs16, d30s16); + + q11s32 = vaddq_s32(q12s32, q11s32); + q10s32 = vaddq_s32(q10s32, q15s32); + + *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14)); + *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), vqrshrn_n_s32(q10s32, 14)); + return; +} + +static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) { + int16_t *in; + int i; + const int stride = 32; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + for (i = 0; i < 4; i++, input += 8) { + in = input; + q8s16 = vld1q_s16(in); + in += stride; + q9s16 = vld1q_s16(in); + in += stride; + q10s16 = vld1q_s16(in); + in += stride; + q11s16 = vld1q_s16(in); + in += stride; + q12s16 = vld1q_s16(in); + in += stride; + q13s16 = vld1q_s16(in); + in += stride; + q14s16 = vld1q_s16(in); + in += stride; + q15s16 = vld1q_s16(in); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = 
vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + q12s16 = vcombine_s16(d17s16, d25s16); + q13s16 = vcombine_s16(d19s16, d27s16); + q14s16 = vcombine_s16(d21s16, d29s16); + q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q10s16)); + q1x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q9s16), vreinterpretq_s32_s16(q11s16)); + q2x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q12s16), vreinterpretq_s32_s16(q14s16)); + q3x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q13s16), vreinterpretq_s32_s16(q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + vst1q_s16(t_buf, q0x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q0x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q1x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q1x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q2x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q2x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q3x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q3x2s16.val[1]); + t_buf += 8; + } + return; +} + +static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16, + int16x8_t q3s16, int16x8_t q6s16, + int16x8_t q7s16, int16x8_t q8s16, + int16x8_t q9s16, int16x8_t q10s16, + int16x8_t q11s16, int16x8_t q12s16, + int16x8_t q13s16, int16x8_t q14s16, + int16x8_t q15s16) { + int16x8_t q0s16, q1s16, 
q4s16, q5s16; + + STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16); + STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16); + + LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16); + STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16); + + LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16); + q2s16 = vaddq_s16(q10s16, q1s16); + q3s16 = vaddq_s16(q11s16, q0s16); + q4s16 = vsubq_s16(q11s16, q0s16); + q5s16 = vsubq_s16(q10s16, q1s16); + + LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16); + STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16); + + LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16); + STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16); + + LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16); + q2s16 = vaddq_s16(q12s16, q1s16); + q3s16 = vaddq_s16(q13s16, q0s16); + q4s16 = vsubq_s16(q13s16, q0s16); + q5s16 = vsubq_s16(q12s16, q1s16); + + LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16); + STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16); + + LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16); + STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16); + + LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16); + q2s16 = vaddq_s16(q14s16, q1s16); + q3s16 = vaddq_s16(q15s16, q0s16); + q4s16 = 
vsubq_s16(q15s16, q0s16); + q5s16 = vsubq_s16(q14s16, q1s16); + + LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16); + STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16); + + LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16); + STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16); + return; +} + +static INLINE void idct32_bands_end_2nd_pass( + int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16, + int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16, + int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16, + int16x8_t q14s16, int16x8_t q15s16) { + uint8_t *r6 = dest + 31 * stride; + uint8_t *r7 = dest /* + 0 * stride*/; + uint8_t *r9 = dest + 15 * stride; + uint8_t *r10 = dest + 16 * stride; + int str2 = stride << 1; + int16x8_t q0s16, q1s16, q4s16, q5s16; + + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; + r9 -= str2; + + LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; + r6 -= str2; + + LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16) + q2s16 = vaddq_s16(q10s16, q1s16); + q3s16 = vaddq_s16(q11s16, q0s16); + q4s16 = vsubq_s16(q11s16, q0s16); + q5s16 = vsubq_s16(q10s16, q1s16); + + LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; + r9 -= str2; + + LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, 
q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; + r6 -= str2; + + LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16) + q2s16 = vaddq_s16(q12s16, q1s16); + q3s16 = vaddq_s16(q13s16, q0s16); + q4s16 = vsubq_s16(q13s16, q0s16); + q5s16 = vsubq_s16(q12s16, q1s16); + + LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; + r9 -= str2; + + LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; + r6 -= str2; + + LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16) + q2s16 = vaddq_s16(q14s16, q1s16); + q3s16 = vaddq_s16(q15s16, q0s16); + q4s16 = vsubq_s16(q15s16, q0s16); + q5s16 = vsubq_s16(q14s16, q1s16); + + LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + + LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + return; +} + +void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int stride) { + int i, idct32_pass_loop; + int16_t trans_buf[32 * 8]; + int16_t pass1[32 * 32]; + int16_t pass2[32 * 32]; + int16_t *out; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + + for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; + idct32_pass_loop++, + input = pass1, // 
the input of pass2 is the result of pass1 + out = pass2) { + for (i = 0; i < 4; i++, input += 32 * 8, out += 8) { // idct32_bands_loop + idct32_transpose_pair(input, trans_buf); + + // ----------------------------------------- + // BLOCK A: 16-19,28-31 + // ----------------------------------------- + // generate 16,17,30,31 + // part of stage 1 + LOAD_FROM_TRANSPOSED(0, 1, 31) + DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(31, 17, 15) + DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16) + // part of stage 2 + q4s16 = vaddq_s16(q0s16, q1s16); + q13s16 = vsubq_s16(q0s16, q1s16); + q6s16 = vaddq_s16(q2s16, q3s16); + q14s16 = vsubq_s16(q2s16, q3s16); + // part of stage 3 + DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16) + + // generate 18,19,28,29 + // part of stage 1 + LOAD_FROM_TRANSPOSED(15, 9, 23) + DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(23, 25, 7) + DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16) + // part of stage 2 + q13s16 = vsubq_s16(q3s16, q2s16); + q3s16 = vaddq_s16(q3s16, q2s16); + q14s16 = vsubq_s16(q1s16, q0s16); + q2s16 = vaddq_s16(q1s16, q0s16); + // part of stage 3 + DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16) + // part of stage 4 + q8s16 = vaddq_s16(q4s16, q2s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q10s16 = vaddq_s16(q7s16, q1s16); + q15s16 = vaddq_s16(q6s16, q3s16); + q13s16 = vsubq_s16(q5s16, q0s16); + q14s16 = vsubq_s16(q7s16, q1s16); + STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16) + STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16) + // part of stage 5 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16) + STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16) + // part of stage 4 + q13s16 = vsubq_s16(q4s16, q2s16); + q14s16 = vsubq_s16(q6s16, q3s16); + // part of stage 5 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16) + STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16) + + // ----------------------------------------- + // BLOCK B: 
20-23,24-27 + // ----------------------------------------- + // generate 20,21,26,27 + // part of stage 1 + LOAD_FROM_TRANSPOSED(7, 5, 27) + DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(27, 21, 11) + DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16) + // part of stage 2 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 3 + DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + + // generate 22,23,24,25 + // part of stage 1 + LOAD_FROM_TRANSPOSED(11, 13, 19) + DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(19, 29, 3) + DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16) + // part of stage 2 + q14s16 = vsubq_s16(q4s16, q5s16); + q5s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q6s16, q7s16); + q6s16 = vaddq_s16(q6s16, q7s16); + // part of stage 3 + DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16) + // part of stage 4 + q10s16 = vaddq_s16(q7s16, q1s16); + q11s16 = vaddq_s16(q5s16, q0s16); + q12s16 = vaddq_s16(q6s16, q2s16); + q15s16 = vaddq_s16(q4s16, q3s16); + // part of stage 6 + LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16) + q8s16 = vaddq_s16(q14s16, q11s16); + q9s16 = vaddq_s16(q13s16, q10s16); + q13s16 = vsubq_s16(q13s16, q10s16); + q11s16 = vsubq_s16(q14s16, q11s16); + STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16) + LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16) + q8s16 = vsubq_s16(q9s16, q12s16); + q10s16 = vaddq_s16(q14s16, q15s16); + q14s16 = vsubq_s16(q14s16, q15s16); + q12s16 = vaddq_s16(q9s16, q12s16); + STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16) + // part of stage 7 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16) + q13s16 = q11s16; + q14s16 = q8s16; + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16) + // part of stage 4 + q14s16 
= vsubq_s16(q5s16, q0s16); + q13s16 = vsubq_s16(q6s16, q2s16); + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16); + q14s16 = vsubq_s16(q7s16, q1s16); + q13s16 = vsubq_s16(q4s16, q3s16); + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16); + // part of stage 6 + LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16) + q8s16 = vaddq_s16(q14s16, q1s16); + q9s16 = vaddq_s16(q13s16, q6s16); + q13s16 = vsubq_s16(q13s16, q6s16); + q1s16 = vsubq_s16(q14s16, q1s16); + STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16) + LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16) + q14s16 = vsubq_s16(q8s16, q5s16); + q10s16 = vaddq_s16(q8s16, q5s16); + q11s16 = vaddq_s16(q9s16, q0s16); + q0s16 = vsubq_s16(q9s16, q0s16); + STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16) + // part of stage 7 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16) + DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16); + STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16) + + // ----------------------------------------- + // BLOCK C: 8-10,11-15 + // ----------------------------------------- + // generate 8,9,14,15 + // part of stage 2 + LOAD_FROM_TRANSPOSED(3, 2, 30) + DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(30, 18, 14) + DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16) + // part of stage 3 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 4 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16) + + // generate 10,11,12,13 + // part of stage 2 + LOAD_FROM_TRANSPOSED(14, 10, 22) + DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(22, 26, 6) + DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16) + // part of stage 3 + q14s16 = vsubq_s16(q4s16, q5s16); + q5s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q6s16, q7s16); + q6s16 = vaddq_s16(q6s16, q7s16); 
+ // part of stage 4 + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16) + // part of stage 5 + q8s16 = vaddq_s16(q0s16, q5s16); + q9s16 = vaddq_s16(q1s16, q7s16); + q13s16 = vsubq_s16(q1s16, q7s16); + q14s16 = vsubq_s16(q3s16, q4s16); + q10s16 = vaddq_s16(q3s16, q4s16); + q15s16 = vaddq_s16(q2s16, q6s16); + STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16) + STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16) + // part of stage 6 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16) + q13s16 = vsubq_s16(q0s16, q5s16); + q14s16 = vsubq_s16(q2s16, q6s16); + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16) + + // ----------------------------------------- + // BLOCK D: 0-3,4-7 + // ----------------------------------------- + // generate 4,5,6,7 + // part of stage 3 + LOAD_FROM_TRANSPOSED(6, 4, 28) + DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(28, 20, 12) + DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + // part of stage 4 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 5 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + + // generate 0,1,2,3 + // part of stage 4 + LOAD_FROM_TRANSPOSED(12, 0, 16) + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(16, 8, 24) + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16) + // part of stage 5 + q4s16 = vaddq_s16(q7s16, q6s16); + q7s16 = vsubq_s16(q7s16, q6s16); + q6s16 = vsubq_s16(q5s16, q14s16); + q5s16 = vaddq_s16(q5s16, q14s16); + // part of stage 6 + q8s16 = vaddq_s16(q4s16, q2s16); + q9s16 = vaddq_s16(q5s16, q3s16); + q10s16 = vaddq_s16(q6s16, q1s16); + q11s16 = vaddq_s16(q7s16, q0s16); + q12s16 = vsubq_s16(q7s16, q0s16); + q13s16 = vsubq_s16(q6s16, q1s16); + q14s16 = vsubq_s16(q5s16, q3s16); + q15s16 = 
vsubq_s16(q4s16, q2s16); + // part of stage 7 + LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16) + q2s16 = vaddq_s16(q8s16, q1s16); + q3s16 = vaddq_s16(q9s16, q0s16); + q4s16 = vsubq_s16(q9s16, q0s16); + q5s16 = vsubq_s16(q8s16, q1s16); + LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + + if (idct32_pass_loop == 0) { + idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, + q10s16, q11s16, q12s16, q13s16, q14s16, + q15s16); + } else { + idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16, + q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, + q14s16, q15s16); + dest += 8; + } + } + } + return; +} diff --git a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.asm b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.asm new file mode 100644 index 000000000..6bd733d5d --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.asm @@ -0,0 +1,71 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+;
+
+
+
+    EXPORT  |aom_idct4x4_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
+;                            int dest_stride)
+;
+; DC-only 4x4 inverse transform: derives a single value a1 from input[0]
+; and adds it to every pixel of the 4x4 block at dest, with clamping.
+;
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|aom_idct4x4_1_add_neon| PROC
+    ldrsh            r0, [r0]              ; r0 = input[0] (sign-extended)
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12           ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000       ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14           ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12           ; out * cospi_16_64
+    mov              r12, r1               ; save dest
+    add              r0, r0, #0x2000       ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14           ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 4)
+    add              r0, r0, #8            ; + (1 <<((4) - 1))
+    asr              r0, r0, #4            ; >> 4
+
+    vdup.s16         q0, r0                ; duplicate a1
+
+    ; load the four 4-byte rows of dest; r1 advances by dest_stride per row
+    vld1.32          {d2[0]}, [r1], r2
+    vld1.32          {d2[1]}, [r1], r2
+    vld1.32          {d4[0]}, [r1], r2
+    vld1.32          {d4[1]}, [r1]
+
+    vaddw.u8         q8, q0, d2            ; dest[x] + a1
+    vaddw.u8         q9, q0, d4
+
+    vqmovun.s16      d6, q8                ; clip_pixel
+    vqmovun.s16      d7, q9
+
+    ; store the rows back through r12, the saved copy of dest
+    vst1.32          {d6[0]}, [r12], r2
+    vst1.32          {d6[1]}, [r12], r2
+    vst1.32          {d7[0]}, [r12], r2
+    vst1.32          {d7[1]}, [r12]
+
+    bx               lr
+    ENDP             ; |aom_idct4x4_1_add_neon|
+
+    END
diff --git a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c
new file mode 100644
index 000000000..3df7a901b
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom_dsp/inv_txfm.h" +#include "aom_ports/mem.h" + +void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8x8_t d6u8; + uint32x2_t d2u32 = vdup_n_u32(0); + uint16x8_t q8u16; + int16x8_t q0s16; + uint8_t *d1, *d2; + int16_t i, a1; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 4); + + q0s16 = vdupq_n_s16(a1); + + // dc_only_idct_add + d1 = d2 = dest; + for (i = 0; i < 2; i++) { + d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0); + d1 += dest_stride; + d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32)); + d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + + vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); + d2 += dest_stride; + vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1); + d2 += dest_stride; + } + return; +} diff --git a/third_party/aom/aom_dsp/arm/idct4x4_add_neon.asm b/third_party/aom/aom_dsp/arm/idct4x4_add_neon.asm new file mode 100644 index 000000000..127acf614 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct4x4_add_neon.asm @@ -0,0 +1,193 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+;
+
+;
+
+    EXPORT  |aom_idct4x4_16_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    AREA     Block, CODE, READONLY ; name this block of code
+;void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; Full 4x4 inverse DCT of the 16 coefficients at input, added to the 4x4
+; pixel block at dest with clamping.
+;
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|aom_idct4x4_16_add_neon| PROC
+
+    ; The 2D transform is done with two passes which are actually pretty
+    ; similar. We first transform the rows. This is done by transposing
+    ; the inputs, doing an SIMD column transform (the columns are the
+    ; transposed rows) and then transpose the results (so that it goes back
+    ; in normal/row positions). Then, we transform the columns by doing
+    ; another SIMD column transform.
+    ; So, two passes of a transpose followed by a column transform.
+
+    ; load the inputs into q8-q9, d16-d19
+    vld1.s16        {q8,q9}, [r0]!
+
+    ; generate scalar constants (r0 is free for reuse after the load above)
+    ; cospi_8_64 = 15137 = 0x3b21
+    mov             r0, #0x3b00
+    add             r0, #0x21
+    ; cospi_16_64 = 11585 = 0x2d41
+    mov             r3, #0x2d00
+    add             r3, #0x41
+    ; cospi_24_64 = 6270 = 0x187e
+    mov             r12, #0x1800
+    add             r12, #0x7e
+
+    ; transpose the input data
+    ; 00 01 02 03   d16
+    ; 10 11 12 13   d17
+    ; 20 21 22 23   d18
+    ; 30 31 32 33   d19
+    vtrn.16         d16, d17
+    vtrn.16         d18, d19
+
+    ; generate constant vectors
+    vdup.16         d20, r0      ; replicate cospi_8_64
+    vdup.16         d21, r3      ; replicate cospi_16_64
+
+    ; 00 10 02 12   d16
+    ; 01 11 03 13   d17
+    ; 20 30 22 32   d18
+    ; 21 31 23 33   d19
+    vtrn.32         q8, q9
+    ; 00 10 20 30   d16
+    ; 01 11 21 31   d17
+    ; 02 12 22 32   d18
+    ; 03 13 23 33   d19
+
+    vdup.16         d22, r12     ; replicate cospi_24_64
+
+    ; do the transform on transposed rows
+
+    ; stage 1
+    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
+    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
+
+    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
+    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
+
+    ; (input[0] + input[2]) * cospi_16_64;
+    ; (input[0] - input[2]) * cospi_16_64;
+    vmull.s16 q13, d23, d21
+    vmull.s16 q14, d24, d21
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
+    vmlsl.s16 q15, d19, d20
+    vmlal.s16 q1,  d19, d22
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d26, q13, #14
+    vqrshrn.s32 d27, q14, #14
+    vqrshrn.s32 d29, q15, #14
+    vqrshrn.s32 d28, q1,  #14
+
+    ; stage 2
+    ; output[0] = step[0] + step[3];
+    ; output[1] = step[1] + step[2];
+    ; output[3] = step[0] - step[3];
+    ; output[2] = step[1] - step[2];
+    vadd.s16 q8,  q13, q14
+    vsub.s16 q9,  q13, q14
+    vswp     d18, d19
+
+    ; transpose the results
+    ; 00 01 02 03   d16
+    ; 10 11 12 13   d17
+    ; 20 21 22 23   d18
+    ; 30 31 32 33   d19
+    vtrn.16         d16, d17
+    vtrn.16         d18, d19
+    ; 00 10 02 12   d16
+    ; 01 11 03 13   d17
+    ; 20 30 22 32   d18
+    ; 21 31 23 33   d19
+    vtrn.32         q8, q9
+    ; 00 10 20 30   d16
+    ; 01 11 21 31   d17
+    ; 02 12 22 32   d18
+    ; 03 13 23 33   d19
+
+    ; do the transform on columns
+
+    ; stage 1
+    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
+    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
+
+    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
+    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
+
+    ; (input[0] + input[2]) * cospi_16_64;
+    ; (input[0] - input[2]) * cospi_16_64;
+    vmull.s16 q13, d23, d21
+    vmull.s16 q14, d24, d21
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
+    vmlsl.s16 q15, d19, d20
+    vmlal.s16 q1,  d19, d22
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d26, q13, #14
+    vqrshrn.s32 d27, q14, #14
+    vqrshrn.s32 d29, q15, #14
+    vqrshrn.s32 d28, q1,  #14
+
+    ; stage 2
+    ; output[0] = step[0] + step[3];
+    ; output[1] = step[1] + step[2];
+    ; output[3] = step[0] - step[3];
+    ; output[2] = step[1] - step[2];
+    vadd.s16 q8,  q13, q14
+    vsub.s16 q9,  q13, q14
+
+    ; The results are in two registers, one of them being swapped. This will
+    ; be taken care of by loading the 'dest' value in a swapped fashion and
+    ; also storing them in the same swapped fashion.
+    ; temp_out[0, 1] = d16, d17 = q8
+    ; temp_out[2, 3] = d19, d18 = q9 swapped
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
+    vrshr.s16 q8, q8, #4
+    vrshr.s16 q9, q9, #4
+
+    ; rows 0 and 1 go to d26; rows 2 and 3 go to d27 with lanes swapped
+    vld1.32 {d26[0]}, [r1], r2
+    vld1.32 {d26[1]}, [r1], r2
+    vld1.32 {d27[1]}, [r1], r2
+    vld1.32 {d27[0]}, [r1]  ; no post-increment
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
+    vaddw.u8 q8, q8, d26
+    vaddw.u8 q9, q9, d27
+
+    ; clip_pixel
+    vqmovun.s16 d26, q8
+    vqmovun.s16 d27, q9
+
+    ; do the stores in reverse order with negative post-increment, by changing
+    ; the sign of the stride
+    rsb r2, r2, #0
+    vst1.32 {d27[0]}, [r1], r2
+    vst1.32 {d27[1]}, [r1], r2
+    vst1.32 {d26[1]}, [r1], r2
+    vst1.32 {d26[0]}, [r1]  ; no post-increment
+    bx              lr
+    ENDP  ; |aom_idct4x4_16_add_neon|
+
+    END
diff --git a/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c b/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c
new file mode 100644
index 000000000..763be1ab0
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/txfm_common.h"
+
+// Full 4x4 inverse DCT: transforms the 16 coefficients at |input| in two 1-D
+// passes, rounds by 4 bits, adds to the 4x4 block at |dest| and saturates.
+void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+  uint8x8_t d26u8, d27u8;
+  uint32x2_t d26u32, d27u32;
+  uint16x8_t q8u16, q9u16;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
+  int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
+  int16x8_t q8s16, q9s16, q13s16, q14s16;
+  int32x4_t q1s32, q13s32, q14s32, q15s32;
+  int16x4x2_t d0x2s16, d1x2s16;
+  int32x4x2_t q0x2s32;
+  uint8_t *d;
+
+  d26u32 = d27u32 = vdup_n_u32(0);  // define every lane before lane loads
+
+  q8s16 = vld1q_s16(input);      // coefficients 0..7
+  q9s16 = vld1q_s16(input + 8);  // coefficients 8..15
+
+  d16s16 = vget_low_s16(q8s16);
+  d17s16 = vget_high_s16(q8s16);
+  d18s16 = vget_low_s16(q9s16);
+  d19s16 = vget_high_s16(q9s16);
+
+  d0x2s16 = vtrn_s16(d16s16, d17s16);  // 4x4 transpose, 16-bit step
+  d1x2s16 = vtrn_s16(d18s16, d19s16);
+  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+  d20s16 = vdup_n_s16((int16_t)cospi_8_64);
+  d21s16 = vdup_n_s16((int16_t)cospi_16_64);
+
+  q0x2s32 =  // 4x4 transpose, 32-bit step
+      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
+  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+  d22s16 = vdup_n_s16((int16_t)cospi_24_64);
+
+  // first pass, stage 1: sum/difference of d16, d18; rotation of d17, d19
+  d23s16 = vadd_s16(d16s16, d18s16);
+  d24s16 = vsub_s16(d16s16, d18s16);
+
+  q15s32 = vmull_s16(d17s16, d22s16);
+  q1s32 = vmull_s16(d17s16, d20s16);
+  q13s32 = vmull_s16(d23s16, d21s16);
+  q14s32 = vmull_s16(d24s16, d21s16);
+
+  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
+
+  d26s16 = vqrshrn_n_s32(q13s32, 14);  // rounding, saturating >> 14
+  d27s16 = vqrshrn_n_s32(q14s32, 14);
+  d29s16 = vqrshrn_n_s32(q15s32, 14);
+  d28s16 = vqrshrn_n_s32(q1s32, 14);
+  q13s16 = vcombine_s16(d26s16, d27s16);
+  q14s16 = vcombine_s16(d28s16, d29s16);
+
+  // stage 2: q8 = q13 + q14, q9 = q13 - q14
+  q8s16 = vaddq_s16(q13s16, q14s16);
+  q9s16 = vsubq_s16(q13s16, q14s16);
+
+  d16s16 = vget_low_s16(q8s16);
+  d17s16 = vget_high_s16(q8s16);
+  d18s16 = vget_high_s16(q9s16);  // vswp d18 d19
+  d19s16 = vget_low_s16(q9s16);
+
+  d0x2s16 = vtrn_s16(d16s16, d17s16);  // transpose between the two passes
+  d1x2s16 = vtrn_s16(d18s16, d19s16);
+  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+  q0x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
+  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+  // do the transform on columns: stage 1 (same arithmetic as the first pass)
+  d23s16 = vadd_s16(d16s16, d18s16);
+  d24s16 = vsub_s16(d16s16, d18s16);
+
+  q15s32 = vmull_s16(d17s16, d22s16);
+  q1s32 = vmull_s16(d17s16, d20s16);
+  q13s32 = vmull_s16(d23s16, d21s16);
+  q14s32 = vmull_s16(d24s16, d21s16);
+
+  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
+
+  d26s16 = vqrshrn_n_s32(q13s32, 14);
+  d27s16 = vqrshrn_n_s32(q14s32, 14);
+  d29s16 = vqrshrn_n_s32(q15s32, 14);
+  d28s16 = vqrshrn_n_s32(q1s32, 14);
+  q13s16 = vcombine_s16(d26s16, d27s16);
+  q14s16 = vcombine_s16(d28s16, d29s16);
+
+  // stage 2: no swap this time, so q9 holds its two halves in swapped order
+  q8s16 = vaddq_s16(q13s16, q14s16);
+  q9s16 = vsubq_s16(q13s16, q14s16);
+
+  q8s16 = vrshrq_n_s16(q8s16, 4);  // rounding >> 4
+  q9s16 = vrshrq_n_s16(q9s16, 4);
+  // NOTE(review): the uint32_t * casts presume 4-byte aligned rows; confirm.
+  d = dest;
+  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);  // 1st row
+  d += dest_stride;
+  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);  // 2nd row
+  d += dest_stride;
+  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);  // 3rd row, swapped
+  d += dest_stride;
+  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);  // 4th row, swapped
+
+  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
+  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
+
+  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));  // saturate to uint8
+  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
+  d = dest;  // stores use the same lane order as the loads
+  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
+  d += dest_stride;
+  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
+  d += dest_stride;
+  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
+  d += dest_stride;
+  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
+}
diff --git a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.asm b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.asm
new file mode 100644
index 000000000..ec07e2053
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.asm
@@ -0,0 +1,91 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+
+
+    EXPORT  |aom_idct8x8_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
+;                                  int dest_stride)
+; DC-only case: one constant is added to 8 rows of 8 pixels, with saturation.
+; r0  int16_t *input     (only input[0] is read)
+; r1  uint8_t *dest
+; r2  int dest_stride    (added to the row pointer after each row)
+
+|aom_idct8x8_1_add_neon| PROC
+    ldrsh            r0, [r0]                  ; r0 = input[0], sign-extended
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest (r12 is free now)
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 5)
+    add              r0, r0, #16               ; + (1 <<((5) - 1))
+    asr              r0, r0, #5                ; >> 5
+
+    vdup.s16         q0, r0                    ; duplicate a1
+
+    ; load destination data (r1 walks the 8 rows; r12 still points at row 0)
+    vld1.64          {d2}, [r1], r2
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r2
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r2
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r2
+    vld1.64          {d17}, [r1]
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r2           ; first four rows
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r2
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                ; dest[x] + a1
+    vaddw.u8         q10, q0, d7               ; dest[x] + a1
+    vaddw.u8         q11, q0, d16              ; dest[x] + a1
+    vaddw.u8         q12, q0, d17              ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r2           ; last four rows
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r2
+    vst1.64          {d31}, [r12], r2
+
+    bx               lr
+    ENDP             ; |aom_idct8x8_1_add_neon|
+
+    END
diff --git a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c
new file mode 100644
index 000000000..c7926f9e4
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/inv_txfm.h"
+#include "aom_ports/mem.h"
+
+// DC-only 8x8 inverse DCT: only input[0] is read. The resulting constant is
+// added to all 64 pixels of |dest| and each sum is saturated to [0, 255].
+void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+  uint8x8_t d2u8, d3u8, d30u8, d31u8;
+  uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+  uint8_t *d1, *d2;
+  int i;
+  int16_t a1;
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 5);
+
+  q0u16 = vreinterpretq_u16_s16(vdupq_n_s16(a1));  // u16 view for vaddw_u8
+
+  d1 = d2 = dest;  // d1 walks the loads, d2 walks the stores
+  for (i = 0; i < 2; i++) {  // 4 rows per pass, byte-typed loads/stores
+    d2u8 = vld1_u8(d1);
+    d1 += dest_stride;
+    d3u8 = vld1_u8(d1);
+    d1 += dest_stride;
+    d30u8 = vld1_u8(d1);
+    d1 += dest_stride;
+    d31u8 = vld1_u8(d1);
+    d1 += dest_stride;
+
+    q9u16 = vaddw_u8(q0u16, d2u8);
+    q10u16 = vaddw_u8(q0u16, d3u8);
+    q11u16 = vaddw_u8(q0u16, d30u8);
+    q12u16 = vaddw_u8(q0u16, d31u8);
+
+    // clip_pixel: read the sums as signed, then saturating-narrow to uint8.
+    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+    d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+    d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+    vst1_u8(d2, d2u8);
+    d2 += dest_stride;
+    vst1_u8(d2, d3u8);
+    d2 += dest_stride;
+    vst1_u8(d2, d30u8);
+    d2 += dest_stride;
+    vst1_u8(d2, d31u8);
+    d2 += dest_stride;
+  }
+}
diff --git a/third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm b/third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm
new file mode 100644
index 000000000..f3d5f246d
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm
@@ -0,0 +1,522 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+    EXPORT  |aom_idct8x8_64_add_neon|
+    EXPORT  |aom_idct8x8_12_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are
+    ; loaded in q8-q15. The output will be stored back into q8-q15 registers.
+    ; This macro will touch q0-q7 registers and use them as buffer during
+    ; calculation.
+    MACRO
+    IDCT8x8_1D
+        ; Expects r3-r9 to hold the cospi constants named below; input[k] is
+        ; the register q(8+k). stage 1
+        vdup.16         d0, r3                    ; duplicate cospi_28_64
+        vdup.16         d1, r4                    ; duplicate cospi_4_64
+        vdup.16         d2, r5                    ; duplicate cospi_12_64
+        vdup.16         d3, r6                    ; duplicate cospi_20_64
+
+        ; input[1] * cospi_28_64
+        vmull.s16       q2, d18, d0
+        vmull.s16       q3, d19, d0
+
+        ; input[5] * cospi_12_64
+        vmull.s16       q5, d26, d2
+        vmull.s16       q6, d27, d2
+
+        ; input[1]*cospi_28_64-input[7]*cospi_4_64
+        vmlsl.s16       q2, d30, d1
+        vmlsl.s16       q3, d31, d1
+
+        ; input[5] * cospi_12_64 - input[3] * cospi_20_64
+        vmlsl.s16       q5, d22, d3
+        vmlsl.s16       q6, d23, d3
+
+        ; step1[4] = dct_const_round_shift(q2:q3), narrowed into q4 (d8:d9)
+        vqrshrn.s32     d8, q2, #14               ; >> 14
+        vqrshrn.s32     d9, q3, #14               ; >> 14
+
+        ; step1[5] = dct_const_round_shift(q5:q6), narrowed into q5 (d10:d11)
+        vqrshrn.s32     d10, q5, #14              ; >> 14
+        vqrshrn.s32     d11, q6, #14              ; >> 14
+
+        ; input[1] * cospi_4_64
+        vmull.s16       q2, d18, d1
+        vmull.s16       q3, d19, d1
+
+        ; input[5] * cospi_20_64
+        vmull.s16       q9, d26, d3
+        vmull.s16       q13, d27, d3
+
+        ; input[1]*cospi_4_64+input[7]*cospi_28_64
+        vmlal.s16       q2, d30, d0
+        vmlal.s16       q3, d31, d0
+
+        ; input[5] * cospi_20_64 + input[3] * cospi_12_64
+        vmlal.s16       q9, d22, d2
+        vmlal.s16       q13, d23, d2
+
+        ; step1[7] = dct_const_round_shift(q2:q3), narrowed into q7 (d14:d15)
+        vqrshrn.s32     d14, q2, #14              ; >> 14
+        vqrshrn.s32     d15, q3, #14              ; >> 14
+
+        ; stage 2 & stage 3 - even half
+        vdup.16         d0, r7                    ; duplicate cospi_16_64
+
+        ; step1[6] = dct_const_round_shift(q9:q13), narrowed into q6 (d12:d13)
+        vqrshrn.s32     d12, q9, #14              ; >> 14
+        vqrshrn.s32     d13, q13, #14             ; >> 14
+
+        ; input[0] * cospi_16_64
+        vmull.s16       q2, d16, d0
+        vmull.s16       q3, d17, d0
+
+        ; input[0] * cospi_16_64
+        vmull.s16       q13, d16, d0
+        vmull.s16       q15, d17, d0
+
+        ; (input[0] + input[4]) * cospi_16_64   (d24:d25 = q12 = input[4])
+        vmlal.s16       q2,  d24, d0
+        vmlal.s16       q3, d25, d0
+
+        ; (input[0] - input[4]) * cospi_16_64
+        vmlsl.s16       q13, d24, d0
+        vmlsl.s16       q15, d25, d0
+
+        vdup.16         d0, r8                    ; duplicate cospi_24_64
+        vdup.16         d1, r9                    ; duplicate cospi_8_64
+
+        ; step[0] = dct_const_round_shift(q2:q3), narrowed into q9 (d18:d19)
+        vqrshrn.s32     d18, q2, #14              ; >> 14
+        vqrshrn.s32     d19, q3, #14              ; >> 14
+
+        ; step[1] = dct_const_round_shift(q13:q15), narrowed into q11 (d22:d23)
+        vqrshrn.s32     d22, q13, #14             ; >> 14
+        vqrshrn.s32     d23, q15, #14             ; >> 14
+
+        ; input[2] * cospi_24_64 - input[6] * cospi_8_64
+        ; input[2] * cospi_24_64                (d20:d21 = q10 = input[2])
+        vmull.s16       q2, d20, d0
+        vmull.s16       q3, d21, d0
+
+        ; input[2] * cospi_8_64
+        vmull.s16       q8, d20, d1
+        vmull.s16       q12, d21, d1
+
+        ; input[2] * cospi_24_64 - input[6] * cospi_8_64   (d28:d29 = input[6])
+        vmlsl.s16       q2, d28, d1
+        vmlsl.s16       q3, d29, d1
+
+        ; input[2] * cospi_8_64 + input[6] * cospi_24_64
+        vmlal.s16       q8, d28, d0
+        vmlal.s16       q12, d29, d0
+
+        ; step[2] = dct_const_round_shift(q2:q3), narrowed into q13 (d26:d27)
+        vqrshrn.s32     d26, q2, #14              ; >> 14
+        vqrshrn.s32     d27, q3, #14              ; >> 14
+
+        ; step[3] = dct_const_round_shift(q8:q12), narrowed into q15 (d30:d31)
+        vqrshrn.s32     d30, q8, #14              ; >> 14
+        vqrshrn.s32     d31, q12, #14             ; >> 14
+
+        vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
+        vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
+        vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
+        vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
+
+        ; stage 3 -odd half
+        vdup.16         d16, r7                   ; duplicate cospi_16_64
+
+        ; stage 2 - odd half
+        vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
+        vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
+        vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
+        vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
+
+        ; step2[6] * cospi_16_64
+        vmull.s16       q9, d28, d16
+        vmull.s16       q10, d29, d16
+
+        ; step2[6] * cospi_16_64
+        vmull.s16       q11, d28, d16
+        vmull.s16       q12, d29, d16
+
+        ; (step2[6] - step2[5]) * cospi_16_64
+        vmlsl.s16       q9, d26, d16
+        vmlsl.s16       q10, d27, d16
+
+        ; (step2[5] + step2[6]) * cospi_16_64
+        vmlal.s16       q11, d26, d16
+        vmlal.s16       q12, d27, d16
+
+        ; step1[5] = dct_const_round_shift((step2[6] - step2[5]) * cospi_16_64)
+        vqrshrn.s32     d10, q9, #14              ; >> 14
+        vqrshrn.s32     d11, q10, #14             ; >> 14
+
+        ; step1[6] = dct_const_round_shift((step2[5] + step2[6]) * cospi_16_64)
+        vqrshrn.s32     d12, q11, #14             ; >> 14
+        vqrshrn.s32     d13, q12, #14             ; >> 14
+
+        ; stage 4
+        vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
+        vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
+        vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
+        vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
+        vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
+        vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
+        vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
+        vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
+    MEND
+
+    ; Transpose an 8x8 matrix of 16-bit data held in q8-q15, in place.
+    MACRO
+    TRANSPOSE8X8
+        vswp            d17, d24
+        vswp            d23, d30
+        vswp            d21, d28
+        vswp            d19, d26
+        vtrn.32         q8, q10
+        vtrn.32         q9, q11
+        vtrn.32         q12, q14
+        vtrn.32         q13, q15
+        vtrn.16         q8, q9
+        vtrn.16         q10, q11
+        vtrn.16         q12, q13
+        vtrn.16         q14, q15
+    MEND
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+; Full 8x8 inverse DCT; the rounded result is added to dest with saturation.
+; r0  int16_t *input     (64 coefficients, read contiguously)
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|aom_idct8x8_64_add_neon| PROC
+    push            {r4-r9}                   ; r4-r9 hold the cospi constants
+    vpush           {d8-d15}                  ; q4-q7 are scratch in IDCT8x8_1D
+    vld1.s16        {q8,q9}, [r0]!
+    vld1.s16        {q10,q11}, [r0]!
+    vld1.s16        {q12,q13}, [r0]!
+    vld1.s16        {q14,q15}, [r0]!
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; generate cospi_28_64 = 3196
+    mov             r3, #0x0c00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64 = 16069
+    mov             r4, #0x3e00
+    add             r4, #0xc5
+
+    ; generate cospi_12_64 = 13623
+    mov             r5, #0x3500
+    add             r5, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r6, #0x2300
+    add             r6, #0x8e
+
+    ; generate cospi_16_64 = 11585
+    mov             r7, #0x2d00
+    add             r7, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r8, #0x1800
+    add             r8, #0x7e
+
+    ; generate cospi_8_64 = 15137
+    mov             r9, #0x3b00
+    add             r9, #0x21
+
+    ; First transform rows
+    IDCT8x8_1D
+
+    ; Transpose the matrix
+    TRANSPOSE8X8
+
+    ; Then transform columns
+    IDCT8x8_1D
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+    vrshr.s16       q8, q8, #5
+    vrshr.s16       q9, q9, #5
+    vrshr.s16       q10, q10, #5
+    vrshr.s16       q11, q11, #5
+    vrshr.s16       q12, q12, #5
+    vrshr.s16       q13, q13, #5
+    vrshr.s16       q14, q14, #5
+    vrshr.s16       q15, q15, #5
+
+    ; save dest pointer (r0 is no longer needed for input)
+    mov             r0, r1
+
+    ; load destination data
+    vld1.64         {d0}, [r1], r2
+    vld1.64         {d1}, [r1], r2
+    vld1.64         {d2}, [r1], r2
+    vld1.64         {d3}, [r1], r2
+    vld1.64         {d4}, [r1], r2
+    vld1.64         {d5}, [r1], r2
+    vld1.64         {d6}, [r1], r2
+    vld1.64         {d7}, [r1]
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+    vaddw.u8        q8, q8, d0
+    vaddw.u8        q9, q9, d1
+    vaddw.u8        q10, q10, d2
+    vaddw.u8        q11, q11, d3
+    vaddw.u8        q12, q12, d4
+    vaddw.u8        q13, q13, d5
+    vaddw.u8        q14, q14, d6
+    vaddw.u8        q15, q15, d7
+
+    ; clip_pixel
+    vqmovun.s16     d0, q8
+    vqmovun.s16     d1, q9
+    vqmovun.s16     d2, q10
+    vqmovun.s16     d3, q11
+    vqmovun.s16     d4, q12
+    vqmovun.s16     d5, q13
+    vqmovun.s16     d6, q14
+    vqmovun.s16     d7, q15
+
+    ; store the data
+    vst1.64         {d0}, [r0], r2
+    vst1.64         {d1}, [r0], r2
+    vst1.64         {d2}, [r0], r2
+    vst1.64         {d3}, [r0], r2
+    vst1.64         {d4}, [r0], r2
+    vst1.64         {d5}, [r0], r2
+    vst1.64         {d6}, [r0], r2
+    vst1.64         {d7}, [r0], r2
+
+    vpop            {d8-d15}
+    pop             {r4-r9}
+    bx              lr
+    ENDP  ; |aom_idct8x8_64_add_neon|
+
+;void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; r0  int16_t *input
+; r1
uint8_t *dest
+; r2  int dest_stride
+
+|aom_idct8x8_12_add_neon| PROC
+    push            {r4-r9}                   ; r4-r9 hold the cospi constants
+    vpush           {d8-d15}                  ; q4-q7 are used as scratch
+    vld1.s16        {q8,q9}, [r0]!
+    vld1.s16        {q10,q11}, [r0]!
+    vld1.s16        {q12,q13}, [r0]!
+    vld1.s16        {q14,q15}, [r0]!
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; generate cospi_28_64 = 3196
+    mov             r3, #0x0c00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64 = 16069
+    mov             r4, #0x3e00
+    add             r4, #0xc5
+
+    ; generate cospi_12_64 = 13623
+    mov             r5, #0x3500
+    add             r5, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r6, #0x2300
+    add             r6, #0x8e
+
+    ; generate cospi_16_64 = 11585
+    mov             r7, #0x2d00
+    add             r7, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r8, #0x1800
+    add             r8, #0x7e
+
+    ; generate cospi_8_64 = 15137
+    mov             r9, #0x3b00
+    add             r9, #0x21
+
+    ; First transform rows. This pass reads only q8-q11 (input[0..3]).
+    ; stage 1
+    ; The following instructions use vqrdmulh to do the
+    ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling
+    ; multiply and shift the result by 16 bits instead of 14 bits. So we need
+    ; to double the constants before multiplying to compensate this.
+    mov             r12, r3, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_28_64*2
+    mov             r12, r4, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_4_64*2
+
+    ; dct_const_round_shift(input[1] * cospi_28_64)
+    vqrdmulh.s16    q4, q9, q0
+
+    mov             r12, r6, lsl #1
+    rsb             r12, #0
+    vdup.16         q0, r12                   ; duplicate -cospi_20_64*2
+
+    ; dct_const_round_shift(input[1] * cospi_4_64)
+    vqrdmulh.s16    q7, q9, q1
+
+    mov             r12, r5, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_12_64*2
+
+    ; dct_const_round_shift(- input[3] * cospi_20_64)
+    vqrdmulh.s16    q5, q11, q0
+
+    mov             r12, r7, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_16_64*2
+
+    ; dct_const_round_shift(input[3] * cospi_12_64)
+    vqrdmulh.s16    q6, q11, q1
+
+    ; stage 2 & stage 3 - even half
+    mov             r12, r8, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_24_64*2
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrdmulh.s16    q9, q8, q0
+
+    mov             r12, r9, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_8_64*2
+
+    ; dct_const_round_shift(input[2] * cospi_24_64)   (q10 = input[2])
+    vqrdmulh.s16    q13, q10, q1
+
+    ; dct_const_round_shift(input[2] * cospi_8_64)
+    vqrdmulh.s16    q15, q10, q0
+
+    ; stage 3 -odd half
+    vdup.16         d16, r7                   ; duplicate cospi_16_64
+
+    ; q9 serves as both step[0] and step[1] here
+    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
+    vadd.s16        q1, q9, q13               ; output[1] = step[1] + step[2]
+    vsub.s16        q2, q9, q13               ; output[2] = step[1] - step[2]
+    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
+
+    ; stage 2 - odd half
+    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
+    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
+    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
+    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d28, d16
+    vmull.s16       q10, d29, d16
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q11, d28, d16
+    vmull.s16       q12, d29, d16
+
+    ; (step2[6] - step2[5]) * cospi_16_64
+    vmlsl.s16       q9, d26, d16
+    vmlsl.s16       q10, d27, d16
+
+    ; (step2[5] + step2[6]) * cospi_16_64
+    vmlal.s16       q11, d26, d16
+    vmlal.s16       q12, d27, d16
+
+    ; step1[5] = dct_const_round_shift((step2[6] - step2[5]) * cospi_16_64)
+    vqrshrn.s32     d10, q9, #14              ; >> 14
+    vqrshrn.s32     d11, q10, #14             ; >> 14
+
+    ; step1[6] = dct_const_round_shift((step2[5] + step2[6]) * cospi_16_64)
+    vqrshrn.s32     d12, q11, #14             ; >> 14
+    vqrshrn.s32     d13, q12, #14             ; >> 14
+
+    ; stage 4
+    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
+
+    ; Transpose the matrix
+    TRANSPOSE8X8
+
+    ; Then transform columns
+    IDCT8x8_1D
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+    vrshr.s16       q8, q8, #5
+    vrshr.s16       q9, q9, #5
+    vrshr.s16       q10, q10, #5
+    vrshr.s16       q11, q11, #5
+    vrshr.s16       q12, q12, #5
+    vrshr.s16       q13, q13, #5
+    vrshr.s16       q14, q14, #5
+    vrshr.s16       q15, q15, #5
+
+    ; save dest pointer (r0 is no longer needed for input)
+    mov             r0, r1
+
+    ; load destination data
+    vld1.64         {d0}, [r1], r2
+    vld1.64         {d1}, [r1], r2
+    vld1.64         {d2}, [r1], r2
+    vld1.64         {d3}, [r1], r2
+    vld1.64         {d4}, [r1], r2
+    vld1.64         {d5}, [r1], r2
+    vld1.64         {d6}, [r1], r2
+    vld1.64         {d7}, [r1]
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+    vaddw.u8        q8, q8, d0
+    vaddw.u8        q9, q9, d1
+    vaddw.u8        q10, q10, d2
+    vaddw.u8        q11, q11, d3
+    vaddw.u8        q12, q12, d4
+    vaddw.u8        q13, q13, d5
+    vaddw.u8        q14, q14, d6
+    vaddw.u8        q15, q15, d7
+
+    ; clip_pixel
+    vqmovun.s16     d0, q8
+    vqmovun.s16     d1, q9
+    vqmovun.s16     d2, q10
+    vqmovun.s16     d3, q11
+    vqmovun.s16     d4, q12
+    vqmovun.s16     d5, q13
+    vqmovun.s16     d6, q14
+    vqmovun.s16     d7, q15
+
+    ; store the data
+    vst1.64         {d0}, [r0], r2
+    vst1.64         {d1}, [r0], r2
+    vst1.64         {d2}, [r0], r2
+    vst1.64         {d3}, [r0], r2
+    vst1.64         {d4}, [r0], r2
+    vst1.64         {d5}, [r0], r2
+    vst1.64         {d6}, [r0], r2
+    vst1.64         {d7}, [r0], r2
+
+    vpop            {d8-d15}
+    pop             {r4-r9}
+    bx              lr
+    ENDP  ;
|aom_idct8x8_12_add_neon|
+
+    END
diff --git a/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c b/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c
new file mode 100644
index 000000000..8ad70862d
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c
@@ -0,0 +1,509 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_config.h"
+#include "aom_dsp/txfm_common.h"
+
+// Transposes, in place, the 8x8 int16 matrix whose rows are q8s16..q15s16.
+static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
+                                int16x8_t *q10s16, int16x8_t *q11s16,
+                                int16x8_t *q12s16, int16x8_t *q13s16,
+                                int16x8_t *q14s16, int16x8_t *q15s16) {
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+  d16s16 = vget_low_s16(*q8s16);
+  d17s16 = vget_high_s16(*q8s16);
+  d18s16 = vget_low_s16(*q9s16);
+  d19s16 = vget_high_s16(*q9s16);
+  d20s16 = vget_low_s16(*q10s16);
+  d21s16 = vget_high_s16(*q10s16);
+  d22s16 = vget_low_s16(*q11s16);
+  d23s16 = vget_high_s16(*q11s16);
+  d24s16 = vget_low_s16(*q12s16);
+  d25s16 = vget_high_s16(*q12s16);
+  d26s16 = vget_low_s16(*q13s16);
+  d27s16 = vget_high_s16(*q13s16);
+  d28s16 = vget_low_s16(*q14s16);
+  d29s16 = vget_high_s16(*q14s16);
+  d30s16 = vget_low_s16(*q15s16);
+  d31s16 = vget_high_s16(*q15s16);
+
+  *q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
+  *q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
+  *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
+  *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
+  *q12s16 = vcombine_s16(d17s16, d25s16);
+  *q13s16 = vcombine_s16(d19s16, d27s16);
+  *q14s16 = vcombine_s16(d21s16, d29s16);
+  *q15s16 = vcombine_s16(d23s16, d31s16);
+
+  q0x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16));
+  q1x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16));
+  q2x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16));
+  q3x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16));
+
+  q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
+                      vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
+  q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
+                      vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
+  q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
+                      vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
+  q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
+                      vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
+
+  *q8s16 = q0x2s16.val[0];
+  *q9s16 = q0x2s16.val[1];
+  *q10s16 = q1x2s16.val[0];
+  *q11s16 = q1x2s16.val[1];
+  *q12s16 = q2x2s16.val[0];
+  *q13s16 = q2x2s16.val[1];
+  *q14s16 = q3x2s16.val[0];
+  *q15s16 = q3x2s16.val[1];
+}
+
+// In-place 1-D 8-point IDCT; *q(8+k)s16 is input[k] on entry, output[k] on exit.
+static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
+                              int16x8_t *q10s16, int16x8_t *q11s16,
+                              int16x8_t *q12s16, int16x8_t *q13s16,
+                              int16x8_t *q14s16, int16x8_t *q15s16) {
+  int16x4_t d0s16, d1s16, d2s16, d3s16;
+  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+  int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+  d0s16 = vdup_n_s16((int16_t)cospi_28_64);
+  d1s16 = vdup_n_s16((int16_t)cospi_4_64);
+  d2s16 = vdup_n_s16((int16_t)cospi_12_64);
+  d3s16 = vdup_n_s16((int16_t)cospi_20_64);
+
+  d16s16 = vget_low_s16(*q8s16);
+  d17s16 = vget_high_s16(*q8s16);
+  d18s16 = vget_low_s16(*q9s16);
+  d19s16 = vget_high_s16(*q9s16);
+  d20s16 = vget_low_s16(*q10s16);
+  d21s16 = vget_high_s16(*q10s16);
+  d22s16 = vget_low_s16(*q11s16);
+  d23s16 = vget_high_s16(*q11s16);
+  d24s16 = vget_low_s16(*q12s16);
+  d25s16 = vget_high_s16(*q12s16);
+  d26s16 = vget_low_s16(*q13s16);
+  d27s16 = vget_high_s16(*q13s16);
+  d28s16 = vget_low_s16(*q14s16);
+  d29s16 = vget_high_s16(*q14s16);
+  d30s16 = vget_low_s16(*q15s16);
+  d31s16 = vget_high_s16(*q15s16);
+
+  q2s32 = vmull_s16(d18s16, d0s16);  // stage 1: input[1] * cospi_28_64
+  q3s32 = vmull_s16(d19s16, d0s16);
+  q5s32 = vmull_s16(d26s16, d2s16);  // input[5] * cospi_12_64
+  q6s32 = vmull_s16(d27s16, d2s16);
+
+  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);  // - input[7] * cospi_4_64
+  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+  q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);  // - input[3] * cospi_20_64
+  q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+  d8s16 = vqrshrn_n_s32(q2s32, 14);  // rounding, saturating >> 14
+  d9s16 = vqrshrn_n_s32(q3s32, 14);
+  d10s16 = vqrshrn_n_s32(q5s32, 14);
+  d11s16 = vqrshrn_n_s32(q6s32, 14);
+  q4s16 = vcombine_s16(d8s16, d9s16);
+  q5s16 = vcombine_s16(d10s16, d11s16);
+
+  q2s32 = vmull_s16(d18s16, d1s16);  // input[1] * cospi_4_64
+  q3s32 = vmull_s16(d19s16, d1s16);
+  q9s32 = vmull_s16(d26s16, d3s16);  // input[5] * cospi_20_64
+  q13s32 = vmull_s16(d27s16, d3s16);
+
+  q2s32 = vmlal_s16(q2s32, d30s16, d0s16);  // + input[7] * cospi_28_64
+  q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);  // + input[3] * cospi_12_64
+  q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+  d14s16 = vqrshrn_n_s32(q2s32, 14);
+  d15s16 = vqrshrn_n_s32(q3s32, 14);
+  d12s16 = vqrshrn_n_s32(q9s32, 14);
+  d13s16 = vqrshrn_n_s32(q13s32, 14);
+  q6s16 = vcombine_s16(d12s16, d13s16);
+  q7s16 = vcombine_s16(d14s16, d15s16);
+
+  d0s16 = vdup_n_s16((int16_t)cospi_16_64);  // stage 2 & 3, even half
+
+  q2s32 = vmull_s16(d16s16, d0s16);
+  q3s32 = vmull_s16(d17s16, d0s16);
+  q13s32 = vmull_s16(d16s16, d0s16);
+  q15s32 = vmull_s16(d17s16, d0s16);
+
+  q2s32 = vmlal_s16(q2s32, d24s16, d0s16);  // (input[0] + input[4]) * c16
+  q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+  q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);  // (input[0] - input[4]) * c16
+  q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+  d0s16 = vdup_n_s16((int16_t)cospi_24_64);
+  d1s16 = vdup_n_s16((int16_t)cospi_8_64);
+
+  d18s16 = vqrshrn_n_s32(q2s32, 14);
+  d19s16 = vqrshrn_n_s32(q3s32, 14);
+  d22s16 = vqrshrn_n_s32(q13s32, 14);
+  d23s16 = vqrshrn_n_s32(q15s32, 14);
+  *q9s16 = vcombine_s16(d18s16, d19s16);
+  *q11s16 = vcombine_s16(d22s16, d23s16);
+
+  q2s32 = vmull_s16(d20s16, d0s16);  // input[2] * cospi_24_64
+  q3s32 = vmull_s16(d21s16, d0s16);
+  q8s32 = vmull_s16(d20s16, d1s16);  // input[2] * cospi_8_64
+  q12s32 = vmull_s16(d21s16, d1s16);
+
+  q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);  // - input[6] * cospi_8_64
+  q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+  q8s32 = vmlal_s16(q8s32, d28s16, d0s16);  // + input[6] * cospi_24_64
+  q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+  d26s16 = vqrshrn_n_s32(q2s32, 14);
+  d27s16 = vqrshrn_n_s32(q3s32, 14);
+  d30s16 = vqrshrn_n_s32(q8s32, 14);
+  d31s16 = vqrshrn_n_s32(q12s32, 14);
+  *q13s16 = vcombine_s16(d26s16, d27s16);
+  *q15s16 = vcombine_s16(d30s16, d31s16);
+
+  q0s16 = vaddq_s16(*q9s16, *q15s16);
+  q1s16 = vaddq_s16(*q11s16, *q13s16);
+  q2s16 = vsubq_s16(*q11s16, *q13s16);
+  q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+  *q13s16 = vsubq_s16(q4s16, q5s16);  // stage 2, odd half
+  q4s16 = vaddq_s16(q4s16, q5s16);
+  *q14s16 = vsubq_s16(q7s16, q6s16);
+  q7s16 = vaddq_s16(q7s16, q6s16);
+  d26s16 = vget_low_s16(*q13s16);
+  d27s16 = vget_high_s16(*q13s16);
+  d28s16 = vget_low_s16(*q14s16);
+  d29s16 = vget_high_s16(*q14s16);
+
+  d16s16 = vdup_n_s16((int16_t)cospi_16_64);  // stage 3, odd half
+
+  q9s32 = vmull_s16(d28s16, d16s16);
+  q10s32 = vmull_s16(d29s16, d16s16);
+  q11s32 = vmull_s16(d28s16, d16s16);
+  q12s32 = vmull_s16(d29s16, d16s16);
+
+  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+  d10s16 = vqrshrn_n_s32(q9s32, 14);
+  d11s16 = vqrshrn_n_s32(q10s32, 14);
+  d12s16 = vqrshrn_n_s32(q11s32, 14);
+  d13s16 = vqrshrn_n_s32(q12s32, 14);
+  q5s16 = vcombine_s16(d10s16, d11s16);
+  q6s16 = vcombine_s16(d12s16, d13s16);
+
+  *q8s16 = vaddq_s16(q0s16, q7s16);  // stage 4
+  *q9s16 = vaddq_s16(q1s16, q6s16);
+  *q10s16 = vaddq_s16(q2s16, q5s16);
+  *q11s16 = vaddq_s16(q3s16, q4s16);
+  *q12s16 = vsubq_s16(q3s16, q4s16);
+  *q13s16 = vsubq_s16(q2s16, q5s16);
+  *q14s16 = vsubq_s16(q1s16, q6s16);
+  *q15s16 = vsubq_s16(q0s16, q7s16);
+}
+
+// Full 8x8 inverse DCT of the 64 coefficients at |input|; the result is rounded
+// by 5 bits, added to the 8x8 block at |dest| and saturated to [0, 255].
+void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+  uint8_t *d1, *d2;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8;
+  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+  uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+  q8s16 = vld1q_s16(input);
+  q9s16 = vld1q_s16(input + 8);
+  q10s16 = vld1q_s16(input + 16);
+  q11s16 = vld1q_s16(input + 24);
+  q12s16 = vld1q_s16(input + 32);
+  q13s16 = vld1q_s16(input + 40);
+  q14s16 = vld1q_s16(input + 48);
+  q15s16 = vld1q_s16(input + 56);
+
+  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+               &q15s16);
+
+  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+             &q15s16);
+
+  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+               &q15s16);
+
+  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+             &q15s16);
+
+  q8s16 = vrshrq_n_s16(q8s16, 5);
+  q9s16 = vrshrq_n_s16(q9s16, 5);
+  q10s16 = vrshrq_n_s16(q10s16, 5);
+  q11s16 = vrshrq_n_s16(q11s16, 5);
+  q12s16 = vrshrq_n_s16(q12s16, 5);
+  q13s16 = vrshrq_n_s16(q13s16, 5);
+  q14s16 = vrshrq_n_s16(q14s16, 5);
+  q15s16 = vrshrq_n_s16(q15s16, 5);
+
+  d1 = d2 = dest;  // d1 walks the loads, d2 walks the stores
+
+  d0u8 = vld1_u8(d1);  // byte-typed loads: no alignment assumption on |dest|
+  d1 += dest_stride;
+  d1u8 = vld1_u8(d1);
+  d1 += dest_stride;
+  d2u8 = vld1_u8(d1);
+  d1 += dest_stride;
+  d3u8 = vld1_u8(d1);
+  d1 += dest_stride;
+
+  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), d0u8);
+  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), d1u8);
+  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), d2u8);
+  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), d3u8);
+
+  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));  // saturate to uint8
+  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+  vst1_u8(d2, d0u8);
+  d2 += dest_stride;
+  vst1_u8(d2, d1u8);
+  d2 += dest_stride;
+  vst1_u8(d2, d2u8);
+  d2 += dest_stride;
+  vst1_u8(d2, d3u8);
+  d2 += dest_stride;
+
+  q8s16 = q12s16;  // second half: rows 4..7 reuse the same temporaries
+  q9s16 = q13s16;
+  q10s16 = q14s16;
+  q11s16 = q15s16;
+
+  d0u8 = vld1_u8(d1);
+  d1 += dest_stride;
+  d1u8 = vld1_u8(d1);
+  d1 += dest_stride;
+  d2u8 = vld1_u8(d1);
+  d1 += dest_stride;
+  d3u8 = vld1_u8(d1);
+  d1 += dest_stride;
+
+  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), d0u8);
+  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), d1u8);
+  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), d2u8);
+  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), d3u8);
+
+  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+  vst1_u8(d2, d0u8);
+  d2 += dest_stride;
+  vst1_u8(d2, d1u8);
+  d2 += dest_stride;
+  vst1_u8(d2, d2u8);
+  d2 += dest_stride;
+  vst1_u8(d2, d3u8);
+  d2 += dest_stride;
+}
+
+void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+  uint8_t *d1,
*d2; + uint8x8_t d0u8, d1u8, d2u8, d3u8; + int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; + int16x4_t d26s16, d27s16, d28s16, d29s16; + uint64x1_t d0u64, d1u64, d2u64, d3u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + uint16x8_t q8u16, q9u16, q10u16, q11u16; + int32x4_t q9s32, q10s32, q11s32, q12s32; + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + q10s16 = vld1q_s16(input + 16); + q11s16 = vld1q_s16(input + 24); + q12s16 = vld1q_s16(input + 32); + q13s16 = vld1q_s16(input + 40); + q14s16 = vld1q_s16(input + 48); + q15s16 = vld1q_s16(input + 56); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + // First transform rows + // stage 1 + q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2); + q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2); + + q4s16 = vqrdmulhq_s16(q9s16, q0s16); + + q0s16 = vdupq_n_s16(-(int16_t)cospi_20_64 * 2); + + q7s16 = vqrdmulhq_s16(q9s16, q1s16); + + q1s16 = vdupq_n_s16((int16_t)cospi_12_64 * 2); + + q5s16 = vqrdmulhq_s16(q11s16, q0s16); + + q0s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2); + + q6s16 = vqrdmulhq_s16(q11s16, q1s16); + + // stage 2 & stage 3 - even half + q1s16 = vdupq_n_s16((int16_t)cospi_24_64 * 2); + + q9s16 = vqrdmulhq_s16(q8s16, q0s16); + + q0s16 = vdupq_n_s16((int16_t)cospi_8_64 * 2); + + q13s16 = vqrdmulhq_s16(q10s16, q1s16); + + q15s16 = vqrdmulhq_s16(q10s16, q0s16); + + // stage 3 -odd half + q0s16 = vaddq_s16(q9s16, q15s16); + q1s16 = vaddq_s16(q9s16, q13s16); + q2s16 = vsubq_s16(q9s16, q13s16); + q3s16 = vsubq_s16(q9s16, q15s16); + + // stage 2 - odd half + q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q7s16, q6s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + d16s16 = 
vdup_n_s16((int16_t)cospi_16_64); + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + q11s32 = vmull_s16(d28s16, d16s16); + q12s32 = vmull_s16(d29s16, d16s16); + + q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); + q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); + q11s32 = vmlal_s16(q11s32, d26s16, d16s16); + q12s32 = vmlal_s16(q12s32, d27s16, d16s16); + + d10s16 = vqrshrn_n_s32(q9s32, 14); + d11s16 = vqrshrn_n_s32(q10s32, 14); + d12s16 = vqrshrn_n_s32(q11s32, 14); + d13s16 = vqrshrn_n_s32(q12s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 4 + q8s16 = vaddq_s16(q0s16, q7s16); + q9s16 = vaddq_s16(q1s16, q6s16); + q10s16 = vaddq_s16(q2s16, q5s16); + q11s16 = vaddq_s16(q3s16, q4s16); + q12s16 = vsubq_s16(q3s16, q4s16); + q13s16 = vsubq_s16(q2s16, q5s16); + q14s16 = vsubq_s16(q1s16, q6s16); + q15s16 = vsubq_s16(q0s16, q7s16); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + q8s16 = vrshrq_n_s16(q8s16, 5); + q9s16 = vrshrq_n_s16(q9s16, 5); + q10s16 = vrshrq_n_s16(q10s16, 5); + q11s16 = vrshrq_n_s16(q11s16, 5); + q12s16 = vrshrq_n_s16(q12s16, 5); + q13s16 = vrshrq_n_s16(q13s16, 5); + q14s16 = vrshrq_n_s16(q14s16, 5); + q15s16 = vrshrq_n_s16(q15s16, 5); + + d1 = d2 = dest; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = 
vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + + q8s16 = q12s16; + q9s16 = q13s16; + q10s16 = q14s16; + q11s16 = q15s16; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + return; +} diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c new file mode 100644 index 000000000..2dc5b2e56 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c @@ -0,0 +1,757 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" + +//------------------------------------------------------------------------------ +// DC 4x4 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. +static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x8_t A = vld1_u8(above); // top row + const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top + const uint16x4_t p1 = vpadd_u16(p0, p0); + sum_top = vcombine_u16(p1, p1); + } + + if (do_left) { + const uint8x8_t L = vld1_u8(left); // left border + const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left + const uint16x4_t p1 = vpadd_u16(p0, p0); + sum_left = vcombine_u16(p1, p1); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 3); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 2); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 2); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x8_t dc = vdup_lane_u8(dc0, 0); + int i; + for (i = 0; i < 4; ++i) { + vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0); + } + } +} + +void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_4x4(dst, stride, above, left, 1, 1); 
+} + +void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + dc_4x4(dst, stride, NULL, left, 0, 1); +} + +void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + dc_4x4(dst, stride, above, NULL, 1, 0); +} + +void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + dc_4x4(dst, stride, NULL, NULL, 0, 0); +} + +//------------------------------------------------------------------------------ +// DC 8x8 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. +static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x8_t A = vld1_u8(above); // top row + const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top + const uint16x4_t p1 = vpadd_u16(p0, p0); + const uint16x4_t p2 = vpadd_u16(p1, p1); + sum_top = vcombine_u16(p2, p2); + } + + if (do_left) { + const uint8x8_t L = vld1_u8(left); // left border + const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left + const uint16x4_t p1 = vpadd_u16(p0, p0); + const uint16x4_t p2 = vpadd_u16(p1, p1); + sum_left = vcombine_u16(p2, p2); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 4); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 3); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 3); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x8_t dc = vdup_lane_u8(dc0, 0); + int i; + for (i = 0; i < 8; ++i) { + vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc)); + } + } +} + +void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + 
dc_8x8(dst, stride, above, left, 1, 1); +} + +void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + dc_8x8(dst, stride, NULL, left, 0, 1); +} + +void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + dc_8x8(dst, stride, above, NULL, 1, 0); +} + +void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + dc_8x8(dst, stride, NULL, NULL, 0, 0); +} + +//------------------------------------------------------------------------------ +// DC 16x16 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. +static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x16_t A = vld1q_u8(above); // top row + const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top + const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + const uint16x4_t p2 = vpadd_u16(p1, p1); + const uint16x4_t p3 = vpadd_u16(p2, p2); + sum_top = vcombine_u16(p3, p3); + } + + if (do_left) { + const uint8x16_t L = vld1q_u8(left); // left row + const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left + const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + const uint16x4_t p2 = vpadd_u16(p1, p1); + const uint16x4_t p3 = vpadd_u16(p2, p2); + sum_left = vcombine_u16(p3, p3); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 5); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 4); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 4); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x16_t dc = vdupq_lane_u8(dc0, 0); + int i; + for (i = 0; i < 16; ++i) { + 
vst1q_u8(dst + i * stride, dc); + } + } +} + +void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_16x16(dst, stride, above, left, 1, 1); +} + +void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + dc_16x16(dst, stride, NULL, left, 0, 1); +} + +void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + dc_16x16(dst, stride, above, NULL, 1, 0); +} + +void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + dc_16x16(dst, stride, NULL, NULL, 0, 0); +} + +//------------------------------------------------------------------------------ +// DC 32x32 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. +static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x16_t A0 = vld1q_u8(above); // top row + const uint8x16_t A1 = vld1q_u8(above + 16); + const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top + const uint16x8_t p1 = vpaddlq_u8(A1); + const uint16x8_t p2 = vaddq_u16(p0, p1); + const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + const uint16x4_t p4 = vpadd_u16(p3, p3); + const uint16x4_t p5 = vpadd_u16(p4, p4); + sum_top = vcombine_u16(p5, p5); + } + + if (do_left) { + const uint8x16_t L0 = vld1q_u8(left); // left row + const uint8x16_t L1 = vld1q_u8(left + 16); + const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left + const uint16x8_t p1 = vpaddlq_u8(L1); + const uint16x8_t p2 = vaddq_u16(p0, p1); + const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + const uint16x4_t p4 = vpadd_u16(p3, p3); + const 
uint16x4_t p5 = vpadd_u16(p4, p4); + sum_left = vcombine_u16(p5, p5); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 6); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 5); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 5); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x16_t dc = vdupq_lane_u8(dc0, 0); + int i; + for (i = 0; i < 32; ++i) { + vst1q_u8(dst + i * stride, dc); + vst1q_u8(dst + i * stride + 16, dc); + } + } +} + +void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_32x32(dst, stride, above, left, 1, 1); +} + +void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + dc_32x32(dst, stride, NULL, left, 0, 1); +} + +void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + dc_32x32(dst, stride, above, NULL, 1, 0); +} + +void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + dc_32x32(dst, stride, NULL, NULL, 0, 0); +} + +// ----------------------------------------------------------------------------- + +void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t XABCD_u8 = vld1_u8(above - 1); + const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); + const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); + const uint32x2_t zero = vdup_n_u32(0); + const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0); + const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL); + const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8)); + const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); + const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); + const uint8x8_t 
JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); + const uint8_t D = vget_lane_u8(XABCD_u8, 4); + const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); + const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); + const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); + const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); + const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); + const uint32x2_t r3 = vreinterpret_u32_u8(avg2); + const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); + const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); + const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); + vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); + vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); + vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); + vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); +} + +#if !HAVE_NEON_ASM + +void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint32x2_t d0u32 = vdup_n_u32(0); + (void)left; + + d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); + for (i = 0; i < 4; i++, dst += stride) + vst1_lane_u32((uint32_t *)dst, d0u32, 0); +} + +void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x8_t d0u8 = vdup_n_u8(0); + (void)left; + + d0u8 = vld1_u8(above); + for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8); +} + +void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x16_t q0u8 = vdupq_n_u8(0); + (void)left; + + q0u8 = vld1q_u8(above); + for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8); +} + +void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)left; + + q0u8 = vld1q_u8(above); + q1u8 = 
vld1q_u8(above + 16); + for (i = 0; i < 32; i++, dst += stride) { + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + } +} + +void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t d0u8 = vdup_n_u8(0); + uint32x2_t d1u32 = vdup_n_u32(0); + (void)above; + + d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); + + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); +} + +void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t d0u8 = vdup_n_u8(0); + uint64x1_t d1u64 = vdup_n_u64(0); + (void)above; + + d1u64 = vld1_u64((const uint64_t *)left); + + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); + vst1_u8(dst, d0u8); +} + +void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + 
int j; + uint8x8_t d2u8 = vdup_n_u8(0); + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)above; + + q1u8 = vld1q_u8(left); + d2u8 = vget_low_u8(q1u8); + for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { + q0u8 = vdupq_lane_u8(d2u8, 0); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 1); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 2); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 3); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 4); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 5); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 6); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 7); + vst1q_u8(dst, q0u8); + dst += stride; + } +} + +void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j, k; + uint8x8_t d2u8 = vdup_n_u8(0); + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)above; + + for (k = 0; k < 2; k++, left += 16) { + q1u8 = vld1q_u8(left); + d2u8 = vget_low_u8(q1u8); + for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { + q0u8 = vdupq_lane_u8(d2u8, 0); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 1); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 2); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 3); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 4); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 5); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 6); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 7); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 
16, q0u8); + dst += stride; + } + } +} + +void aom_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint16x8_t q1u16, q3u16; + int16x8_t q1s16; + uint8x8_t d0u8 = vdup_n_u8(0); + uint32x2_t d2u32 = vdup_n_u32(0); + + d0u8 = vld1_dup_u8(above - 1); + d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); + q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); + for (i = 0; i < 4; i++, dst += stride) { + q1u16 = vdupq_n_u16((uint16_t)left[i]); + q1s16 = + vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16)); + d0u8 = vqmovun_s16(q1s16); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + } +} + +void aom_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j; + uint16x8_t q0u16, q3u16, q10u16; + int16x8_t q0s16; + uint16x4_t d20u16; + uint8x8_t d0u8, d2u8, d30u8; + + d0u8 = vld1_dup_u8(above - 1); + d30u8 = vld1_u8(left); + d2u8 = vld1_u8(above); + q10u16 = vmovl_u8(d30u8); + q3u16 = vsubl_u8(d2u8, d0u8); + d20u16 = vget_low_u16(q10u16); + for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { + q0u16 = vdupq_lane_u16(d20u16, 0); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += stride; + q0u16 = vdupq_lane_u16(d20u16, 1); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += stride; + q0u16 = vdupq_lane_u16(d20u16, 2); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += stride; + q0u16 = vdupq_lane_u16(d20u16, 3); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t 
*)dst, vreinterpret_u64_u8(d0u8)); + dst += stride; + } +} + +void aom_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j, k; + uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16; + uint8x16_t q0u8, q1u8; + int16x8_t q0s16, q1s16, q8s16, q11s16; + uint16x4_t d20u16; + uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; + + q0u8 = vld1q_dup_u8(above - 1); + q1u8 = vld1q_u8(above); + q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); + q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); + for (k = 0; k < 2; k++, left += 8) { + d18u8 = vld1_u8(left); + q10u16 = vmovl_u8(d18u8); + d20u16 = vget_low_u16(q10u16); + for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { + q0u16 = vdupq_lane_u16(d20u16, 0); + q8u16 = vdupq_lane_u16(d20u16, 1); + q1s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); + q11s16 = + vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); + q8s16 = + vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); + d2u8 = vqmovun_s16(q1s16); + d3u8 = vqmovun_s16(q0s16); + d22u8 = vqmovun_s16(q11s16); + d23u8 = vqmovun_s16(q8s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); + dst += stride; + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); + dst += stride; + + q0u16 = vdupq_lane_u16(d20u16, 2); + q8u16 = vdupq_lane_u16(d20u16, 3); + q1s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); + q11s16 = + vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); + q8s16 = + vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); + d2u8 = vqmovun_s16(q1s16); + d3u8 = vqmovun_s16(q0s16); + 
d22u8 = vqmovun_s16(q11s16); + d23u8 = vqmovun_s16(q8s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); + dst += stride; + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); + dst += stride; + } + } +} + +void aom_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j, k; + uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16; + uint8x16_t q0u8, q1u8, q2u8; + int16x8_t q12s16, q13s16, q14s16, q15s16; + uint16x4_t d6u16; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8; + + q0u8 = vld1q_dup_u8(above - 1); + q1u8 = vld1q_u8(above); + q2u8 = vld1q_u8(above + 16); + q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); + q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); + q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8)); + q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8)); + for (k = 0; k < 4; k++, left += 8) { + d26u8 = vld1_u8(left); + q3u16 = vmovl_u8(d26u8); + d6u16 = vget_low_u16(q3u16); + for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { + q0u16 = vdupq_lane_u16(d6u16, 0); + q12s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); + q13s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += stride; + + q0u16 = vdupq_lane_u16(d6u16, 1); + q12s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); 
+ q13s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += stride; + + q0u16 = vdupq_lane_u16(d6u16, 2); + q12s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); + q13s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += stride; + + q0u16 = vdupq_lane_u16(d6u16, 3); + q12s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); + q13s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += stride; + } + } 
+} +#endif // !HAVE_NEON_ASM diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm b/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm new file mode 100644 index 000000000..7d04d3553 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm @@ -0,0 +1,633 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_v_predictor_4x4_neon| + EXPORT |aom_v_predictor_8x8_neon| + EXPORT |aom_v_predictor_16x16_neon| + EXPORT |aom_v_predictor_32x32_neon| + EXPORT |aom_h_predictor_4x4_neon| + EXPORT |aom_h_predictor_8x8_neon| + EXPORT |aom_h_predictor_16x16_neon| + EXPORT |aom_h_predictor_32x32_neon| + EXPORT |aom_tm_predictor_4x4_neon| + EXPORT |aom_tm_predictor_8x8_neon| + EXPORT |aom_tm_predictor_16x16_neon| + EXPORT |aom_tm_predictor_32x32_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_v_predictor_4x4_neon| PROC + vld1.32 {d0[0]}, [r2] ; load the 4 above pixels once + vst1.32 {d0[0]}, [r0], r1 ; store them to each of the 4 rows + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + bx lr + ENDP ; |aom_v_predictor_4x4_neon| + +;void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + 
+|aom_v_predictor_8x8_neon| PROC + vld1.8 {d0}, [r2] ; load the 8 above pixels once + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + bx lr + ENDP ; |aom_v_predictor_8x8_neon| + +;void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_v_predictor_16x16_neon| PROC + vld1.8 {q0}, [r2] ; load the 16 above pixels once + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + bx lr + ENDP ; |aom_v_predictor_16x16_neon| + +;void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_v_predictor_32x32_neon| PROC + vld1.8 {q0, q1}, [r2] ; load the 32 above pixels once + mov r2, #2 ; r2 (above) is no longer needed: reuse as loop counter, 2 passes x 16 rows +loop_v + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + subs r2, r2, #1 + bgt loop_v + bx lr + ENDP ; |aom_v_predictor_32x32_neon| + +;void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t 
*above +; r3 const uint8_t *left + +|aom_h_predictor_4x4_neon| PROC + vld1.32 {d1[0]}, [r3] ; load the 4 left pixels + vdup.8 d0, d1[0] ; broadcast left[row] across the row, then store it + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[1] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[2] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[3] + vst1.32 {d0[0]}, [r0], r1 + bx lr + ENDP ; |aom_h_predictor_4x4_neon| + +;void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_h_predictor_8x8_neon| PROC + vld1.64 {d1}, [r3] ; load the 8 left pixels + vdup.8 d0, d1[0] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[1] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[2] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[3] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[4] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[5] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[6] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[7] + vst1.64 {d0}, [r0], r1 + bx lr + ENDP ; |aom_h_predictor_8x8_neon| + +;void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_h_predictor_16x16_neon| PROC + vld1.8 {q1}, [r3] ; load the 16 left pixels (d2 = rows 0-7, d3 = rows 8-15) + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0], r1 + bx lr + ENDP ; |aom_h_predictor_16x16_neon| + 
+;void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_h_predictor_32x32_neon| PROC + sub r1, r1, #16 ; first 16-byte store of each row post-increments r0 by 16, so the second store advances by stride - 16 + mov r2, #2 ; r2 (above) is unused here: loop counter, 2 passes x 16 rows +loop_h + vld1.8 {q1}, [r3]! ; load the next 16 left pixels + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + subs r2, r2, #1 + bgt loop_h + bx lr + ENDP ; |aom_h_predictor_32x32_neon| + +;void aom_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_tm_predictor_4x4_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.u8 {d0[]}, [r12] + + ; Load above 4 pixels + vld1.32 {d2[0]}, [r2] + + ; Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + ; Load left row by row and compute left + (above - ytop_left) + ; 1st row and 2nd row + vld1.u8 {d2[]}, [r3]! + vld1.u8 {d4[]}, [r3]! 
+ vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 ; saturate back to unsigned 8 bit + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + + ; 3rd row and 4th row + vld1.u8 {d2[]}, [r3]! + vld1.u8 {d4[]}, [r3] + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + bx lr + ENDP ; |aom_tm_predictor_4x4_neon| + +;void aom_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_tm_predictor_8x8_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + ; preload 8 left + vld1.8 {d30}, [r3] + + ; Load above 8 pixels + vld1.64 {d2}, [r2] + + vmovl.u8 q10, d30 ; widen the 8 left pixels to 16 bit (d20 = rows 0-3, d21 = rows 4-7) + + ; Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + ; Load left row by row and compute left + (above - ytop_left) + ; 1st row and 2nd row + vdup.16 q0, d20[0] + vdup.16 q1, d20[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + ; 3rd row and 4th row + vdup.16 q8, d20[2] + vdup.16 q9, d20[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + ; 5th row and 6th row + vdup.16 q0, d21[0] + vdup.16 q1, d21[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + ; 7th row and 8th row + vdup.16 q8, d21[2] + vdup.16 q9, d21[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + bx lr + ENDP ; |aom_tm_predictor_8x8_neon| + +;void aom_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; 
const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_tm_predictor_16x16_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + ; Load above 16 pixels + vld1.8 {q1}, [r2] + + ; preload 8 left pixels into d18 + vld1.8 {d18}, [r3]! + + ; Compute above - ytop_left + vsubl.u8 q2, d2, d0 + vsubl.u8 q3, d3, d0 + + vmovl.u8 q10, d18 + + ; Load left row by row and compute left + (above - ytop_left) + ; Process 8 rows in each single loop and loop 2 times to process 16 rows. + mov r2, #2 + +loop_16x16_neon + ; Process two rows. + vdup.16 q0, d20[0] + vdup.16 q8, d20[1] + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d20[2] ; preload next 2 rows data + vdup.16 q8, d20[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + ; Process two rows. + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[0] ; preload next 2 rows data + vdup.16 q8, d21[1] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[2] ; preload next 2 rows data + vdup.16 q8, d21[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vld1.8 {d18}, [r3]! 
; preload 8 left pixels into d18 (last pass reads left[16..23], which is never used) + vmovl.u8 q10, d18 + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + subs r2, r2, #1 + bgt loop_16x16_neon + + bx lr + ENDP ; |aom_tm_predictor_16x16_neon| + +;void aom_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_tm_predictor_32x32_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + ; Load above 32 pixels + vld1.8 {q1}, [r2]! + vld1.8 {q2}, [r2] + + ; preload 8 left pixels + vld1.8 {d26}, [r3]! + + ; Compute above - ytop_left + vsubl.u8 q8, d2, d0 + vsubl.u8 q9, d3, d0 + vsubl.u8 q10, d4, d0 + vsubl.u8 q11, d5, d0 + + vmovl.u8 q3, d26 + + ; Load left row by row and compute left + (above - ytop_left) + ; Process 8 rows in each single loop and loop 4 times to process 32 rows. + mov r2, #4 + +loop_32x32_neon + ; Process two rows. + vdup.16 q0, d6[0] + vdup.16 q2, d6[1] + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q1, d6[2] + vdup.16 q2, d6[3] + vst1.64 {d24-d27}, [r0], r1 + + ; Process two rows. 
+ vadd.s16 q12, q1, q8 + vadd.s16 q13, q1, q9 + vadd.s16 q14, q1, q10 + vadd.s16 q15, q1, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[0] + vdup.16 q2, d7[1] + vst1.64 {d24-d27}, [r0], r1 + + ; Process two rows. + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[2] + vdup.16 q2, d7[3] + vst1.64 {d24-d27}, [r0], r1 + + ; Process two rows. + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vld1.8 {d0}, [r3]! ; preload 8 left pixels (last pass reads left[32..39], which is never used) + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vmovl.u8 q3, d0 + vst1.64 {d24-d27}, [r0], r1 + + subs r2, r2, #1 + bgt loop_32x32_neon + + bx lr + ENDP ; |aom_tm_predictor_32x32_neon| + + END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm b/third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm new file mode 100644 index 000000000..b6e2c9edb --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm @@ -0,0 +1,202 @@ +; +; Copyright (c) 2016, Alliance for Open Media. 
All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_lpf_horizontal_4_dual_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void aom_lpf_horizontal_4_dual_neon(uint8_t *s, int p, +; const uint8_t *blimit0, +; const uint8_t *limit0, +; const uint8_t *thresh0, +; const uint8_t *blimit1, +; const uint8_t *limit1, +; const uint8_t *thresh1) +; r0 uint8_t *s, +; r1 int p, +; r2 const uint8_t *blimit0, +; r3 const uint8_t *limit0, +; sp const uint8_t *thresh0, +; sp+4 const uint8_t *blimit1, +; sp+8 const uint8_t *limit1, +; sp+12 const uint8_t *thresh1, + +|aom_lpf_horizontal_4_dual_neon| PROC + push {lr} ; stack arguments are now 4 bytes further from sp + + ldr r12, [sp, #4] ; load thresh0 + vld1.8 {d0}, [r2] ; load blimit0 to first half q + vld1.8 {d2}, [r3] ; load limit0 to first half q + + add r1, r1, r1 ; double pitch + ldr r2, [sp, #8] ; load blimit1 + + vld1.8 {d4}, [r12] ; load thresh0 to first half q + + ldr r3, [sp, #12] ; load limit1 + ldr r12, [sp, #16] ; load thresh1 + vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q + + sub r2, r0, r1, lsl #1 ; s[-4 * p] + + vld1.8 {d3}, [r3] ; load limit1 to 2nd half q + vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q + + vpush {d8-d15} ; save neon registers + + add r3, r2, r1, lsr #1 ; s[-3 * p] + + vld1.u8 {q3}, [r2@64], r1 ; p3 + vld1.u8 {q4}, [r3@64], r1 ; p2 + vld1.u8 {q5}, [r2@64], r1 ; p1 + vld1.u8 {q6}, [r3@64], r1 ; p0 + vld1.u8 {q7}, [r2@64], r1 ; q0 + vld1.u8 {q8}, [r3@64], r1 ; q1 + vld1.u8 {q9}, [r2@64] ; q2 + vld1.u8 {q10}, [r3@64] ; q3 + + sub r2, r2, r1, lsl #1 + sub r3, r3, r1, lsl #1 + + bl 
aom_loop_filter_neon_16 + + vst1.u8 {q5}, [r2@64], r1 ; store op1 + vst1.u8 {q6}, [r3@64], r1 ; store op0 + vst1.u8 {q7}, [r2@64], r1 ; store oq0 + vst1.u8 {q8}, [r3@64], r1 ; store oq1 + + vpop {d8-d15} ; restore neon registers + + pop {pc} + ENDP ; |aom_lpf_horizontal_4_dual_neon| + +; void aom_loop_filter_neon_16(); +; This is a helper function for the loopfilters. The individual functions do the +; necessary load, transpose (if necessary) and store. This function uses +; registers d8-d15, so the calling function must save those registers. +; +; r0-r3, r12 PRESERVE +; q0 blimit +; q1 limit +; q2 thresh +; q3 p3 +; q4 p2 +; q5 p1 +; q6 p0 +; q7 q0 +; q8 q1 +; q9 q2 +; q10 q3 +; +; Outputs: +; q5 op1 +; q6 op0 +; q7 oq0 +; q8 oq1 +|aom_loop_filter_neon_16| PROC + + ; filter_mask + vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2) + vabd.u8 q12, q4, q5 ; m2 = abs(p2 - p1) + vabd.u8 q13, q5, q6 ; m3 = abs(p1 - p0) + vabd.u8 q14, q8, q7 ; m4 = abs(q1 - q0) + vabd.u8 q3, q9, q8 ; m5 = abs(q2 - q1) + vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 q11, q11, q12 ; m7 = max(m1, m2) + vmax.u8 q12, q13, q14 ; m8 = max(m3, m4) + + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + + vmax.u8 q3, q3, q4 ; m9 = max(m5, m6) + + vmov.u8 q10, #0x80 + + vmax.u8 q15, q11, q12 ; m10 = max(m7, m8) + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + vmax.u8 q15, q15, q3 ; m11 = max(m10, m9) + + vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) + vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 + + veor q7, q7, q10 ; qs0 + + vcge.u8 q15, q1, q15 ; (limit >= m11)*-1 + + vshr.u8 q2, q2, #1 ; a = a / 2 + veor q6, q6, q10 ; ps0 + + veor q5, q5, q10 ; ps1 + vqadd.u8 q9, q9, q2 ; a = b + a + + veor q8, q8, q10 ; qs1 + + vmov.u16 q4, #3 + + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q11, d15, d13 + + vcge.u8 q9, q0, q9 ; (blimit >= a)*-1 + + vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1) + vorr q14, q13, q14 ; hev + + vmul.i16 q2, q2, q4 
; 3 * ( qs0 - ps0) + vmul.i16 q11, q11, q4 + + vand q1, q1, q14 ; filter &= hev + vand q15, q15, q9 ; mask + + vmov.u8 q4, #3 + + vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0) + vaddw.s8 q11, q11, d3 + + vmov.u8 q9, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d2, q2 + vqmovn.s16 d3, q11 + vand q1, q1, q15 ; filter &= mask + + vqadd.s8 q2, q1, q4 ; filter2 = clamp(filter+3) + vqadd.s8 q1, q1, q9 ; filter1 = clamp(filter+4) + vshr.s8 q2, q2, #3 ; filter2 >>= 3 + vshr.s8 q1, q1, #3 ; filter1 >>= 3 + + + vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + filter2) + vqsub.s8 q0, q7, q1 ; u = clamp(qs0 - filter1) + + ; outer tap adjustments + vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1 + + veor q7, q0, q10 ; *oq0 = u^0x80 + + vbic q1, q1, q14 ; filter &= ~hev + + vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter) + vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - filter) + + veor q6, q11, q10 ; *op0 = u^0x80 + veor q5, q13, q10 ; *op1 = u^0x80 + veor q8, q12, q10 ; *oq1 = u^0x80 + + bx lr + ENDP ; |aom_loop_filter_neon_16| + + END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c new file mode 100644 index 000000000..c0562a6ea --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> + +#include "./aom_dsp_rtcd.h" +#include "./aom_config.h" +#include "aom/aom_integer.h" + +static INLINE void loop_filter_neon_16(uint8x16_t qblimit, // blimit + uint8x16_t qlimit, // limit + uint8x16_t qthresh, // thresh + uint8x16_t q3, // p3 + uint8x16_t q4, // p2 + uint8x16_t q5, // p1 + uint8x16_t q6, // p0 + uint8x16_t q7, // q0 + uint8x16_t q8, // q1 + uint8x16_t q9, // q2 + uint8x16_t q10, // q3 + uint8x16_t *q5r, // p1 + uint8x16_t *q6r, // p0 + uint8x16_t *q7r, // q0 + uint8x16_t *q8r) { // q1 + uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int16x8_t q2s16, q11s16; + uint16x8_t q4u16; + int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8; + int8x8_t d2s8, d3s8; + + q11u8 = vabdq_u8(q3, q4); // abs(p3 - p2) + q12u8 = vabdq_u8(q4, q5); // abs(p2 - p1) + q13u8 = vabdq_u8(q5, q6); // abs(p1 - p0) + q14u8 = vabdq_u8(q8, q7); // abs(q1 - q0) + q3 = vabdq_u8(q9, q8); // abs(q2 - q1) + q4 = vabdq_u8(q10, q9); // abs(q3 - q2) + + q11u8 = vmaxq_u8(q11u8, q12u8); + q12u8 = vmaxq_u8(q13u8, q14u8); + q3 = vmaxq_u8(q3, q4); + q15u8 = vmaxq_u8(q11u8, q12u8); + + q9 = vabdq_u8(q6, q7); // abs(p0 - q0) + + // aom_hevmask + q13u8 = vcgtq_u8(q13u8, qthresh); + q14u8 = vcgtq_u8(q14u8, qthresh); + q15u8 = vmaxq_u8(q15u8, q3); + + q2u8 = vabdq_u8(q5, q8); // abs(p1 - q1) + q9 = vqaddq_u8(q9, q9); // abs(p0 - q0) * 2, saturating + + q15u8 = vcgeq_u8(qlimit, q15u8); // limit >= largest neighbour difference + + // aom_filter() function + // convert to signed + q10 = vdupq_n_u8(0x80); + q8 = veorq_u8(q8, q10); + q7 = veorq_u8(q7, q10); + q6 = veorq_u8(q6, q10); + q5 = veorq_u8(q5, q10); + + q2u8 = vshrq_n_u8(q2u8, 1); + q9 = vqaddq_u8(q9, q2u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), + vget_low_s8(vreinterpretq_s8_u8(q6))); + q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), + vget_high_s8(vreinterpretq_s8_u8(q6))); + + q9 = vcgeq_u8(qblimit, q9); // blimit >= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + + q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8)); + + q14u8 = vorrq_u8(q13u8, q14u8); // hev + + q4u16 = vdupq_n_u16(3); + q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); + q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); + + q1u8 = 
vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); + q15u8 = vandq_u8(q15u8, q9); // combined filter mask + + q1s8 = vreinterpretq_s8_u8(q1u8); + q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); + + q4 = vdupq_n_u8(3); + q9 = vdupq_n_u8(4); + // aom_filter = clamp(aom_filter + 3 * ( qs0 - ps0)) + d2s8 = vqmovn_s16(q2s16); + d3s8 = vqmovn_s16(q11s16); + q1s8 = vcombine_s8(d2s8, d3s8); + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); + q1s8 = vreinterpretq_s8_u8(q1u8); + + q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4)); + q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); + q2s8 = vshrq_n_s8(q2s8, 3); + q1s8 = vshrq_n_s8(q1s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); + q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); + + q1s8 = vrshrq_n_s8(q1s8, 1); + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); // outer taps only where !hev + + q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); + q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); + + *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10); // back to unsigned + *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10); + *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10); + *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10); + return; +} + +void aom_lpf_horizontal_4_dual_neon( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { + uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1; + uint8x16_t qblimit, qlimit, qthresh; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; + + dblimit0 = vld1_u8(blimit0); // reads 8 bytes from each threshold pointer + dlimit0 = vld1_u8(limit0); + dthresh0 = vld1_u8(thresh0); + dblimit1 = vld1_u8(blimit1); + dlimit1 = vld1_u8(limit1); + dthresh1 = vld1_u8(thresh1); + qblimit = vcombine_u8(dblimit0, dblimit1); // low half: set 0, high half: set 1 + qlimit = vcombine_u8(dlimit0, dlimit1); + qthresh = vcombine_u8(dthresh0, dthresh1); + + s -= (p << 2); // start 4 rows above s (the p3 row) + + q3u8 = vld1q_u8(s); + s += p; + q4u8 = vld1q_u8(s); + s += p; + q5u8 = vld1q_u8(s); + s += p; + 
q6u8 = vld1q_u8(s); + s += p; + q7u8 = vld1q_u8(s); + s += p; + q8u8 = vld1q_u8(s); + s += p; + q9u8 = vld1q_u8(s); + s += p; + q10u8 = vld1q_u8(s); + + loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8, + q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8); + + s -= (p * 5); // s is on the q3 row; step back to the p1 row + vst1q_u8(s, q5u8); + s += p; + vst1q_u8(s, q6u8); + s += p; + vst1q_u8(s, q7u8); + s += p; + vst1q_u8(s, q8u8); + return; +} diff --git a/third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm b/third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm new file mode 100644 index 000000000..8b54984d5 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm @@ -0,0 +1,252 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_lpf_horizontal_4_neon| + EXPORT |aom_lpf_vertical_4_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; Currently aom only works on iterations 8 at a time. The aom loop filter +; works on 16 iterations at a time. 
+; +; void aom_lpf_horizontal_4_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|aom_lpf_horizontal_4_neon| PROC + push {lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r2, [sp, #4] ; load thresh + add r1, r1, r1 ; double pitch + + vld1.8 {d1[]}, [r3] ; duplicate *limit + vld1.8 {d2[]}, [r2] ; duplicate *thresh + + sub r2, r0, r1, lsl #1 ; r2 = s - 4 * p (r1 holds the doubled pitch) + add r3, r2, r1, lsr #1 ; r3 = s - 3 * p + + vld1.u8 {d3}, [r2@64], r1 ; p3 + vld1.u8 {d4}, [r3@64], r1 ; p2 + vld1.u8 {d5}, [r2@64], r1 ; p1 + vld1.u8 {d6}, [r3@64], r1 ; p0 + vld1.u8 {d7}, [r2@64], r1 ; q0 + vld1.u8 {d16}, [r3@64], r1 ; q1 + vld1.u8 {d17}, [r2@64] ; q2 + vld1.u8 {d18}, [r3@64] ; q3 + + sub r2, r2, r1, lsl #1 + sub r3, r3, r1, lsl #1 + + bl aom_loop_filter_neon + + vst1.u8 {d4}, [r2@64], r1 ; store op1 + vst1.u8 {d5}, [r3@64], r1 ; store op0 + vst1.u8 {d6}, [r2@64], r1 ; store oq0 + vst1.u8 {d7}, [r3@64], r1 ; store oq1 + + pop {pc} + ENDP ; |aom_lpf_horizontal_4_neon| + +; Currently aom only works on iterations 8 at a time. The aom loop filter +; works on 16 iterations at a time. 
+; +; void aom_lpf_vertical_4_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|aom_lpf_vertical_4_neon| PROC + push {lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + vld1.8 {d1[]}, [r3] ; duplicate *limit + + ldr r3, [sp, #4] ; load thresh + sub r2, r0, #4 ; r2 = s - 4 (4 columns to the left) + + vld1.8 {d2[]}, [r3] ; duplicate *thresh + + vld1.u8 {d3}, [r2], r1 ; load s data + vld1.u8 {d4}, [r2], r1 + vld1.u8 {d5}, [r2], r1 + vld1.u8 {d6}, [r2], r1 + vld1.u8 {d7}, [r2], r1 + vld1.u8 {d16}, [r2], r1 + vld1.u8 {d17}, [r2], r1 + vld1.u8 {d18}, [r2] + + ;transpose the 8x8 matrix (8 rows of 8 bytes) + vtrn.32 d3, d7 + vtrn.32 d4, d16 + vtrn.32 d5, d17 + vtrn.32 d6, d18 + + vtrn.16 d3, d5 + vtrn.16 d4, d6 + vtrn.16 d7, d17 + vtrn.16 d16, d18 + + vtrn.8 d3, d4 + vtrn.8 d5, d6 + vtrn.8 d7, d16 + vtrn.8 d17, d18 + + bl aom_loop_filter_neon + + sub r0, r0, #2 + + ;store op1, op0, oq0, oq1 + vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1 + vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1 + vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1 + vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1 + vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1 + vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1 + vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1 + vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0] + + pop {pc} + ENDP ; |aom_lpf_vertical_4_neon| + +; void aom_loop_filter_neon(); +; This is a helper function for the loopfilters. The individual functions do the +; necessary load, transpose (if necessary) and store. The function does not use +; registers d8-d15. 
+; +; Inputs: +; r0-r3, r12 PRESERVE +; d0 blimit +; d1 limit +; d2 thresh +; d3 p3 +; d4 p2 +; d5 p1 +; d6 p0 +; d7 q0 +; d16 q1 +; d17 q2 +; d18 q3 +; +; Outputs: +; d4 op1 +; d5 op0 +; d6 oq0 +; d7 oq1 +|aom_loop_filter_neon| PROC + ; filter_mask + vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) + vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) + vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) + vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) + vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1) + vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) + vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) + + vabd.u8 d17, d6, d7 ; abs(p0 - q0) + + vmax.u8 d3, d3, d4 ; m3 = max(m5, m6) + + vmov.u8 d18, #0x80 + + vmax.u8 d23, d19, d20 ; m1 = max(m1, m2) + + ; hevmask + vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1 + vmax.u8 d23, d23, d3 ; m1 = max(m1, m3) + + vabd.u8 d28, d5, d16 ; a = abs(p1 - q1) + vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2 + + veor d7, d7, d18 ; qs0 + + vcge.u8 d23, d1, d23 ; (limit >= m1)*-1 + + ; filter() function + ; convert to signed + + vshr.u8 d28, d28, #1 ; a = a / 2 + veor d6, d6, d18 ; ps0 + + veor d5, d5, d18 ; ps1 + vqadd.u8 d17, d17, d28 ; a = b + a + + veor d16, d16, d18 ; qs1 + + vmov.u8 d19, #3 + + vsub.s8 d28, d7, d6 ; ( qs0 - ps0) + + vcge.u8 d17, d0, d17 ; (blimit >= a)*-1 + + vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1) + vorr d22, d21, d22 ; hevmask + + vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0) + + vand d27, d27, d22 ; filter &= hev + vand d23, d23, d17 ; filter_mask + + vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0) + + vmov.u8 d17, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d27, q12 + + vand d27, d27, d23 ; filter &= mask + + vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3) + vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4) + vshr.s8 d28, d28, #3 ; filter2 >>= 3 + vshr.s8 d27, d27, #3 ; filter1 >>= 3 + + vqadd.s8 d19, 
d6, d28 ; u = clamp(ps0 + filter2) + vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1) + + ; outer tap adjustments + vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1 + + veor d6, d26, d18 ; *oq0 = u^0x80 + + vbic d27, d27, d22 ; filter &= ~hev + + vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter) + vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter) + + veor d5, d19, d18 ; *op0 = u^0x80 + veor d4, d21, d18 ; *op1 = u^0x80 + veor d7, d20, d18 ; *oq1 = u^0x80 + + bx lr + ENDP ; |aom_loop_filter_neon| + + END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c new file mode 100644 index 000000000..2b1f80b81 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> + +#include "./aom_dsp_rtcd.h" + +static INLINE void loop_filter_neon(uint8x8_t dblimit, // blimit + uint8x8_t dlimit, // limit + uint8x8_t dthresh, // thresh + uint8x8_t d3u8, // p3 + uint8x8_t d4u8, // p2 + uint8x8_t d5u8, // p1 + uint8x8_t d6u8, // p0 + uint8x8_t d7u8, // q0 + uint8x8_t d16u8, // q1 + uint8x8_t d17u8, // q2 + uint8x8_t d18u8, // q3 + uint8x8_t *d4ru8, // p1 + uint8x8_t *d5ru8, // p0 + uint8x8_t *d6ru8, // q0 + uint8x8_t *d7ru8) { // q1 + uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8; + int16x8_t q12s16; + int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8; + + d19u8 = vabd_u8(d3u8, d4u8); // abs(p3 - p2) + d20u8 = vabd_u8(d4u8, d5u8); // abs(p2 - p1) + d21u8 = vabd_u8(d5u8, d6u8); // abs(p1 - p0) + d22u8 = vabd_u8(d16u8, d7u8); // abs(q1 - q0) + d3u8 = vabd_u8(d17u8, d16u8); // abs(q2 - q1) + d4u8 = vabd_u8(d18u8, d17u8); // abs(q3 - q2) + + d19u8 = vmax_u8(d19u8, d20u8); + d20u8 = vmax_u8(d21u8, d22u8); + d3u8 = vmax_u8(d3u8, d4u8); + d23u8 = vmax_u8(d19u8, d20u8); + + d17u8 = vabd_u8(d6u8, d7u8); // abs(p0 - q0) + + d21u8 = vcgt_u8(d21u8, dthresh); // hev: abs(p1 - p0) > thresh + d22u8 = vcgt_u8(d22u8, dthresh); // hev: abs(q1 - q0) > thresh + d23u8 = vmax_u8(d23u8, d3u8); + + d28u8 = vabd_u8(d5u8, d16u8); // abs(p1 - q1) + d17u8 = vqadd_u8(d17u8, d17u8); // abs(p0 - q0) * 2, saturating + + d23u8 = vcge_u8(dlimit, d23u8); // limit >= largest neighbour difference + + d18u8 = vdup_n_u8(0x80); // xor with 0x80 converts to signed + d5u8 = veor_u8(d5u8, d18u8); + d6u8 = veor_u8(d6u8, d18u8); + d7u8 = veor_u8(d7u8, d18u8); + d16u8 = veor_u8(d16u8, d18u8); + + d28u8 = vshr_n_u8(d28u8, 1); + d17u8 = vqadd_u8(d17u8, d28u8); + + d19u8 = vdup_n_u8(3); + + d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8)); + + d17u8 = vcge_u8(dblimit, d17u8); // blimit >= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + + d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8)); + + d22u8 = vorr_u8(d21u8, d22u8); // hev mask + + q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8)); + + d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8); + d23u8 = vand_u8(d23u8, d17u8); // combined filter mask + + q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8)); + + d17u8 = vdup_n_u8(4); + + d27s8 = vqmovn_s16(q12s16); + d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8); + d27s8 = vreinterpret_s8_u8(d27u8); + + 
d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8)); + d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8)); + d28s8 = vshr_n_s8(d28s8, 3); + d27s8 = vshr_n_s8(d27s8, 3); + + d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8); + d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8); + + d27s8 = vrshr_n_s8(d27s8, 1); + d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8)); // outer taps only where !hev + + d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8); + d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8); + + *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8); // back to unsigned + *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8); + *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8); + *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8); + return; +} + +void aom_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + uint8_t *s, *psrc; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; + + dblimit = vld1_u8(blimit); // reads 8 bytes from each threshold pointer + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + psrc = src - (pitch << 2); // start 4 rows above src (the p3 row) + for (i = 0; i < 1; i++) { + s = psrc + i * 8; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, + d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8); + + s -= (pitch * 5); // s is on the q3 row; step back to the p1 row + vst1_u8(s, d4u8); + s += pitch; + vst1_u8(s, d5u8); + s += pitch; + vst1_u8(s, d6u8); + s += pitch; + vst1_u8(s, d7u8); + } + return; +} + +void aom_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i, pitch8; + uint8_t *s; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; + uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; 
+ uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; + uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; + uint8x8x4_t d4Result; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + pitch8 = pitch * 8; + for (i = 0; i < 1; i++, src += pitch8) { + s = src - (i + 1) * 4; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8)); + d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8)); + d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8)); + d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8)); + + d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), + vreinterpret_u16_u32(d2tmp2.val[0])); + d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), + vreinterpret_u16_u32(d2tmp3.val[0])); + d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), + vreinterpret_u16_u32(d2tmp2.val[1])); + d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), + vreinterpret_u16_u32(d2tmp3.val[1])); + + d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), + vreinterpret_u8_u16(d2tmp5.val[0])); + d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), + vreinterpret_u8_u16(d2tmp5.val[1])); + d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), + vreinterpret_u8_u16(d2tmp7.val[0])); + d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), + vreinterpret_u8_u16(d2tmp7.val[1])); + + d3u8 = d2tmp8.val[0]; + d4u8 = d2tmp8.val[1]; + d5u8 = d2tmp9.val[0]; + d6u8 = d2tmp9.val[1]; + d7u8 = d2tmp10.val[0]; + d16u8 = d2tmp10.val[1]; + d17u8 = d2tmp11.val[0]; + d18u8 = d2tmp11.val[1]; + + loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, + d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8); + + d4Result.val[0] = 
d4u8; + d4Result.val[1] = d5u8; + d4Result.val[2] = d6u8; + d4Result.val[3] = d7u8; + + src -= 2; + vst4_lane_u8(src, d4Result, 0); + src += pitch; + vst4_lane_u8(src, d4Result, 1); + src += pitch; + vst4_lane_u8(src, d4Result, 2); + src += pitch; + vst4_lane_u8(src, d4Result, 3); + src += pitch; + vst4_lane_u8(src, d4Result, 4); + src += pitch; + vst4_lane_u8(src, d4Result, 5); + src += pitch; + vst4_lane_u8(src, d4Result, 6); + src += pitch; + vst4_lane_u8(src, d4Result, 7); + } + return; +} diff --git a/third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm b/third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm new file mode 100644 index 000000000..9f3db66ee --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm @@ -0,0 +1,428 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_lpf_horizontal_8_neon| + EXPORT |aom_lpf_vertical_8_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; Currently aom only works on iterations 8 at a time. The aom loop filter +; works on 16 iterations at a time. 
+; +; void aom_lpf_horizontal_8_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|aom_lpf_horizontal_8_neon| PROC + push {r4-r5, lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r2, [sp, #12] ; load thresh + add r1, r1, r1 ; double pitch + + vld1.8 {d1[]}, [r3] ; duplicate *limit + vld1.8 {d2[]}, [r2] ; duplicate *thresh + + sub r3, r0, r1, lsl #1 ; move src pointer down by 4 lines + add r2, r3, r1, lsr #1 ; set to 3 lines down + + vld1.u8 {d3}, [r3@64], r1 ; p3 + vld1.u8 {d4}, [r2@64], r1 ; p2 + vld1.u8 {d5}, [r3@64], r1 ; p1 + vld1.u8 {d6}, [r2@64], r1 ; p0 + vld1.u8 {d7}, [r3@64], r1 ; q0 + vld1.u8 {d16}, [r2@64], r1 ; q1 + vld1.u8 {d17}, [r3@64] ; q2 + vld1.u8 {d18}, [r2@64], r1 ; q3 + + sub r3, r3, r1, lsl #1 + sub r2, r2, r1, lsl #2 + + bl aom_mbloop_filter_neon + + vst1.u8 {d0}, [r2@64], r1 ; store op2 + vst1.u8 {d1}, [r3@64], r1 ; store op1 + vst1.u8 {d2}, [r2@64], r1 ; store op0 + vst1.u8 {d3}, [r3@64], r1 ; store oq0 + vst1.u8 {d4}, [r2@64], r1 ; store oq1 + vst1.u8 {d5}, [r3@64], r1 ; store oq2 + + pop {r4-r5, pc} + + ENDP ; |aom_lpf_horizontal_8_neon| + +; void aom_lpf_vertical_8_neon(uint8_t *s, +; int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|aom_lpf_vertical_8_neon| PROC + push {r4-r5, lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + vld1.8 {d1[]}, [r3] ; duplicate *limit + + ldr r3, [sp, #12] ; load thresh + sub r2, r0, #4 ; move s pointer down by 4 columns + + vld1.8 {d2[]}, [r3] ; duplicate *thresh + + vld1.u8 {d3}, [r2], r1 ; load s data + vld1.u8 {d4}, [r2], r1 + vld1.u8 {d5}, [r2], r1 + vld1.u8 {d6}, [r2], r1 + vld1.u8 {d7}, [r2], r1 + vld1.u8 {d16}, [r2], r1 + vld1.u8 {d17}, [r2], r1 + vld1.u8 
{d18}, [r2] + + ;transpose to 8x16 matrix + vtrn.32 d3, d7 + vtrn.32 d4, d16 + vtrn.32 d5, d17 + vtrn.32 d6, d18 + + vtrn.16 d3, d5 + vtrn.16 d4, d6 + vtrn.16 d7, d17 + vtrn.16 d16, d18 + + vtrn.8 d3, d4 + vtrn.8 d5, d6 + vtrn.8 d7, d16 + vtrn.8 d17, d18 + + sub r2, r0, #3 + add r3, r0, #1 + + bl aom_mbloop_filter_neon + + ;store op2, op1, op0, oq0 + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r2], r1 + vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r2], r1 + vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r2], r1 + vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r2], r1 + vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r2], r1 + vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r2], r1 + vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r2], r1 + vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r2] + + ;store oq1, oq2 + vst2.8 {d4[0], d5[0]}, [r3], r1 + vst2.8 {d4[1], d5[1]}, [r3], r1 + vst2.8 {d4[2], d5[2]}, [r3], r1 + vst2.8 {d4[3], d5[3]}, [r3], r1 + vst2.8 {d4[4], d5[4]}, [r3], r1 + vst2.8 {d4[5], d5[5]}, [r3], r1 + vst2.8 {d4[6], d5[6]}, [r3], r1 + vst2.8 {d4[7], d5[7]}, [r3] + + pop {r4-r5, pc} + ENDP ; |aom_lpf_vertical_8_neon| + +; void aom_mbloop_filter_neon(); +; This is a helper function for the loopfilters. The individual functions do the +; necessary load, transpose (if necessary) and store. The function does not use +; registers d8-d15. 
+; +; Inputs: +; r0-r3, r12 PRESERVE +; d0 blimit +; d1 limit +; d2 thresh +; d3 p3 +; d4 p2 +; d5 p1 +; d6 p0 +; d7 q0 +; d16 q1 +; d17 q2 +; d18 q3 +; +; Outputs: +; d0 op2 +; d1 op1 +; d2 op0 +; d3 oq0 +; d4 oq1 +; d5 oq2 +|aom_mbloop_filter_neon| PROC + ; filter_mask + vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) + vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) + vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) + vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) + vabd.u8 d23, d17, d16 ; m5 = abs(q2 - q1) + vabd.u8 d24, d18, d17 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) + vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) + + vabd.u8 d25, d6, d4 ; m7 = abs(p0 - p2) + + vmax.u8 d23, d23, d24 ; m3 = max(m5, m6) + + vabd.u8 d26, d7, d17 ; m8 = abs(q0 - q2) + + vmax.u8 d19, d19, d20 + + vabd.u8 d24, d6, d7 ; m9 = abs(p0 - q0) + vabd.u8 d27, d3, d6 ; m10 = abs(p3 - p0) + vabd.u8 d28, d18, d7 ; m11 = abs(q3 - q0) + + vmax.u8 d19, d19, d23 + + vabd.u8 d23, d5, d16 ; a = abs(p1 - q1) + vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 + + ; abs () > limit + vcge.u8 d19, d1, d19 + + ; only compare the largest value to thresh + vmax.u8 d25, d25, d26 ; m4 = max(m7, m8) + vmax.u8 d26, d27, d28 ; m5 = max(m10, m11) + + vshr.u8 d23, d23, #1 ; a = a / 2 + + vmax.u8 d25, d25, d26 ; m4 = max(m4, m5) + + vqadd.u8 d24, d24, d23 ; a = b + a + + vmax.u8 d20, d20, d25 ; m2 = max(m2, m4) + + vmov.u8 d23, #1 + vcge.u8 d24, d0, d24 ; a > blimit + + vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 + + vcge.u8 d20, d23, d20 ; flat + + vand d19, d19, d24 ; mask + + vcgt.u8 d23, d22, d2 ; (abs(q1 - q0) > thresh)*-1 + + vand d20, d20, d19 ; flat & mask + + vmov.u8 d22, #0x80 + + vorr d23, d21, d23 ; hev + + ; This instruction will truncate the "flat & mask" masks down to 4 bits + ; each to fit into one 32 bit arm register. The values are stored in + ; q10.64[0]. 
+ vshrn.u16 d30, q10, #4 + vmov.u32 r4, d30[0] ; flat & mask 4bits + + adds r5, r4, #1 ; Check for all 1's + + ; If mask and flat are 1's for all vectors, then we only need to execute + ; the power branch for all vectors. + beq power_branch_only + + cmp r4, #0 ; Check for 0, set flag for later + + ; mbfilter() function + ; filter() function + ; convert to signed + veor d21, d7, d22 ; qs0 + veor d24, d6, d22 ; ps0 + veor d25, d5, d22 ; ps1 + veor d26, d16, d22 ; qs1 + + vmov.u8 d27, #3 + + vsub.s8 d28, d21, d24 ; ( qs0 - ps0) + + vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) + + vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) + + vand d29, d29, d23 ; filter &= hev + + vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) + + vmov.u8 d29, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d28, q15 + + vand d28, d28, d19 ; filter &= mask + + vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) + vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) + vshr.s8 d30, d30, #3 ; filter2 >>= 3 + vshr.s8 d29, d29, #3 ; filter1 >>= 3 + + vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) + vqsub.s8 d21, d21, d29 ; oq0 = clamp(qs0 - filter1) + + ; outer tap adjustments: ++filter1 >> 1 + vrshr.s8 d29, d29, #1 + vbic d29, d29, d23 ; filter &= ~hev + + vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) + vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) + + ; If mask and flat are 0's for all vectors, then we only need to execute + ; the filter branch for all vectors. + beq filter_branch_only + + ; If mask and flat are mixed then we must perform both branches and + ; combine the data. + veor d24, d24, d22 ; *f_op0 = u^0x80 + veor d21, d21, d22 ; *f_oq0 = u^0x80 + veor d25, d25, d22 ; *f_op1 = u^0x80 + veor d26, d26, d22 ; *f_oq1 = u^0x80 + + ; At this point we have already executed the filter branch. The filter + ; branch does not set op2 or oq2, so use p2 and q2. Execute the power + ; branch and combine the data. 
+ vmov.u8 d23, #2 + vaddl.u8 q14, d6, d7 ; r_op2 = p0 + q0 + vmlal.u8 q14, d3, d27 ; r_op2 += p3 * 3 + vmlal.u8 q14, d4, d23 ; r_op2 += p2 * 2 + + vbif d0, d4, d20 ; op2 |= p2 & ~(flat & mask) + + vaddw.u8 q14, d5 ; r_op2 += p1 + + vbif d1, d25, d20 ; op1 |= f_op1 & ~(flat & mask) + + vqrshrn.u16 d30, q14, #3 ; r_op2 + + vsubw.u8 q14, d3 ; r_op1 = r_op2 - p3 + vsubw.u8 q14, d4 ; r_op1 -= p2 + vaddw.u8 q14, d5 ; r_op1 += p1 + vaddw.u8 q14, d16 ; r_op1 += q1 + + vbif d2, d24, d20 ; op0 |= f_op0 & ~(flat & mask) + + vqrshrn.u16 d31, q14, #3 ; r_op1 + + vsubw.u8 q14, d3 ; r_op0 = r_op1 - p3 + vsubw.u8 q14, d5 ; r_op0 -= p1 + vaddw.u8 q14, d6 ; r_op0 += p0 + vaddw.u8 q14, d17 ; r_op0 += q2 + + vbit d0, d30, d20 ; op2 |= r_op2 & (flat & mask) + + vqrshrn.u16 d23, q14, #3 ; r_op0 + + vsubw.u8 q14, d3 ; r_oq0 = r_op0 - p3 + vsubw.u8 q14, d6 ; r_oq0 -= p0 + vaddw.u8 q14, d7 ; r_oq0 += q0 + + vbit d1, d31, d20 ; op1 |= r_op1 & (flat & mask) + + vaddw.u8 q14, d18 ; oq0 += q3 + + vbit d2, d23, d20 ; op0 |= r_op0 & (flat & mask) + + vqrshrn.u16 d22, q14, #3 ; r_oq0 + + vsubw.u8 q14, d4 ; r_oq1 = r_oq0 - p2 + vsubw.u8 q14, d7 ; r_oq1 -= q0 + vaddw.u8 q14, d16 ; r_oq1 += q1 + + vbif d3, d21, d20 ; oq0 |= f_oq0 & ~(flat & mask) + + vaddw.u8 q14, d18 ; r_oq1 += q3 + + vbif d4, d26, d20 ; oq1 |= f_oq1 & ~(flat & mask) + + vqrshrn.u16 d6, q14, #3 ; r_oq1 + + vsubw.u8 q14, d5 ; r_oq2 = r_oq1 - p1 + vsubw.u8 q14, d16 ; r_oq2 -= q1 + vaddw.u8 q14, d17 ; r_oq2 += q2 + vaddw.u8 q14, d18 ; r_oq2 += q3 + + vbif d5, d17, d20 ; oq2 |= q2 & ~(flat & mask) + + vqrshrn.u16 d7, q14, #3 ; r_oq2 + + vbit d3, d22, d20 ; oq0 |= r_oq0 & (flat & mask) + vbit d4, d6, d20 ; oq1 |= r_oq1 & (flat & mask) + vbit d5, d7, d20 ; oq2 |= r_oq2 & (flat & mask) + + bx lr + +power_branch_only + vmov.u8 d27, #3 + vmov.u8 d21, #2 + vaddl.u8 q14, d6, d7 ; op2 = p0 + q0 + vmlal.u8 q14, d3, d27 ; op2 += p3 * 3 + vmlal.u8 q14, d4, d21 ; op2 += p2 * 2 + vaddw.u8 q14, d5 ; op2 += p1 + vqrshrn.u16 d0, q14, #3 ; op2 + + 
vsubw.u8 q14, d3 ; op1 = op2 - p3 + vsubw.u8 q14, d4 ; op1 -= p2 + vaddw.u8 q14, d5 ; op1 += p1 + vaddw.u8 q14, d16 ; op1 += q1 + vqrshrn.u16 d1, q14, #3 ; op1 + + vsubw.u8 q14, d3 ; op0 = op1 - p3 + vsubw.u8 q14, d5 ; op0 -= p1 + vaddw.u8 q14, d6 ; op0 += p0 + vaddw.u8 q14, d17 ; op0 += q2 + vqrshrn.u16 d2, q14, #3 ; op0 + + vsubw.u8 q14, d3 ; oq0 = op0 - p3 + vsubw.u8 q14, d6 ; oq0 -= p0 + vaddw.u8 q14, d7 ; oq0 += q0 + vaddw.u8 q14, d18 ; oq0 += q3 + vqrshrn.u16 d3, q14, #3 ; oq0 + + vsubw.u8 q14, d4 ; oq1 = oq0 - p2 + vsubw.u8 q14, d7 ; oq1 -= q0 + vaddw.u8 q14, d16 ; oq1 += q1 + vaddw.u8 q14, d18 ; oq1 += q3 + vqrshrn.u16 d4, q14, #3 ; oq1 + + vsubw.u8 q14, d5 ; oq2 = oq1 - p1 + vsubw.u8 q14, d16 ; oq2 -= q1 + vaddw.u8 q14, d17 ; oq2 += q2 + vaddw.u8 q14, d18 ; oq2 += q3 + vqrshrn.u16 d5, q14, #3 ; oq2 + + bx lr + +filter_branch_only + ; TODO(fgalligan): See if we can rearange registers so we do not need to + ; do the 2 vswp. + vswp d0, d4 ; op2 + vswp d5, d17 ; oq2 + veor d2, d24, d22 ; *op0 = u^0x80 + veor d3, d21, d22 ; *oq0 = u^0x80 + veor d1, d25, d22 ; *op1 = u^0x80 + veor d4, d26, d22 ; *oq1 = u^0x80 + + bx lr + + ENDP ; |aom_mbloop_filter_neon| + + END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c new file mode 100644 index 000000000..c4502fdb5 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c @@ -0,0 +1,430 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> + +#include "./aom_dsp_rtcd.h" + +static INLINE void mbloop_filter_neon(uint8x8_t dblimit, // mblimit + uint8x8_t dlimit, // limit + uint8x8_t dthresh, // thresh + uint8x8_t d3u8, // p3 + uint8x8_t d4u8, // p2 + uint8x8_t d5u8, // p1 + uint8x8_t d6u8, // p0 + uint8x8_t d7u8, // q0 + uint8x8_t d16u8, // q1 + uint8x8_t d17u8, // q2 + uint8x8_t d18u8, // q3 + uint8x8_t *d0ru8, // p2 + uint8x8_t *d1ru8, // p1 + uint8x8_t *d2ru8, // p0 + uint8x8_t *d3ru8, // q0 + uint8x8_t *d4ru8, // q1 + uint8x8_t *d5ru8) { // q2 + uint32_t flat; + uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8; + uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; + int16x8_t q15s16; + uint16x8_t q10u16, q14u16; + int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8; + + d19u8 = vabd_u8(d3u8, d4u8); + d20u8 = vabd_u8(d4u8, d5u8); + d21u8 = vabd_u8(d5u8, d6u8); + d22u8 = vabd_u8(d16u8, d7u8); + d23u8 = vabd_u8(d17u8, d16u8); + d24u8 = vabd_u8(d18u8, d17u8); + + d19u8 = vmax_u8(d19u8, d20u8); + d20u8 = vmax_u8(d21u8, d22u8); + + d25u8 = vabd_u8(d6u8, d4u8); + + d23u8 = vmax_u8(d23u8, d24u8); + + d26u8 = vabd_u8(d7u8, d17u8); + + d19u8 = vmax_u8(d19u8, d20u8); + + d24u8 = vabd_u8(d6u8, d7u8); + d27u8 = vabd_u8(d3u8, d6u8); + d28u8 = vabd_u8(d18u8, d7u8); + + d19u8 = vmax_u8(d19u8, d23u8); + + d23u8 = vabd_u8(d5u8, d16u8); + d24u8 = vqadd_u8(d24u8, d24u8); + + d19u8 = vcge_u8(dlimit, d19u8); + + d25u8 = vmax_u8(d25u8, d26u8); + d26u8 = vmax_u8(d27u8, d28u8); + + d23u8 = vshr_n_u8(d23u8, 1); + + d25u8 = vmax_u8(d25u8, d26u8); + + d24u8 = vqadd_u8(d24u8, d23u8); + + d20u8 = vmax_u8(d20u8, d25u8); + + d23u8 = vdup_n_u8(1); + d24u8 = vcge_u8(dblimit, d24u8); + + d21u8 = vcgt_u8(d21u8, dthresh); + + d20u8 = vcge_u8(d23u8, d20u8); + + d19u8 = vand_u8(d19u8, d24u8); + + d23u8 = vcgt_u8(d22u8, dthresh); + + d20u8 = vand_u8(d20u8, d19u8); + + d22u8 = vdup_n_u8(0x80); + + d23u8 = vorr_u8(d21u8, d23u8); + + q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), + 
vreinterpret_u16_u8(d21u8)); + + d30u8 = vshrn_n_u16(q10u16, 4); + flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0); + + if (flat == 0xffffffff) { // Check for all 1's, power_branch_only + d27u8 = vdup_n_u8(3); + d21u8 = vdup_n_u8(2); + q14u16 = vaddl_u8(d6u8, d7u8); + q14u16 = vmlal_u8(q14u16, d3u8, d27u8); + q14u16 = vmlal_u8(q14u16, d4u8, d21u8); + q14u16 = vaddw_u8(q14u16, d5u8); + *d0ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vaddw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d16u8); + *d1ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d17u8); + *d2ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d3ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vsubw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d4ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vsubw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d17u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d5ru8 = vqrshrn_n_u16(q14u16, 3); + } else { + d21u8 = veor_u8(d7u8, d22u8); + d24u8 = veor_u8(d6u8, d22u8); + d25u8 = veor_u8(d5u8, d22u8); + d26u8 = veor_u8(d16u8, d22u8); + + d27u8 = vdup_n_u8(3); + + d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8)); + d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8)); + + q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8)); + + d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8)); + + q15s16 = vaddw_s8(q15s16, d29s8); + + d29u8 = vdup_n_u8(4); + + d28s8 = vqmovn_s16(q15s16); + + d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8)); + + d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8)); + d29s8 = vqadd_s8(d28s8, 
vreinterpret_s8_u8(d29u8)); + d30s8 = vshr_n_s8(d30s8, 3); + d29s8 = vshr_n_s8(d29s8, 3); + + d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8); + d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8); + + d29s8 = vrshr_n_s8(d29s8, 1); + d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8)); + + d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8); + d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8); + + if (flat == 0) { // filter_branch_only + *d0ru8 = d4u8; + *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); + *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); + *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); + *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); + *d5ru8 = d17u8; + return; + } + + d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); + d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); + d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); + d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); + + d23u8 = vdup_n_u8(2); + q14u16 = vaddl_u8(d6u8, d7u8); + q14u16 = vmlal_u8(q14u16, d3u8, d27u8); + q14u16 = vmlal_u8(q14u16, d4u8, d23u8); + + d0u8 = vbsl_u8(d20u8, dblimit, d4u8); + + q14u16 = vaddw_u8(q14u16, d5u8); + + d1u8 = vbsl_u8(d20u8, dlimit, d25u8); + + d30u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vaddw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d16u8); + + d2u8 = vbsl_u8(d20u8, dthresh, d24u8); + + d31u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d17u8); + + *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8); + + d23u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d7u8); + + *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8); + + q14u16 = vaddw_u8(q14u16, d18u8); + + *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8); + + d22u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vsubw_u8(q14u16, 
d7u8); + q14u16 = vaddw_u8(q14u16, d16u8); + + d3u8 = vbsl_u8(d20u8, d3u8, d21u8); + + q14u16 = vaddw_u8(q14u16, d18u8); + + d4u8 = vbsl_u8(d20u8, d4u8, d26u8); + + d6u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vsubw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d17u8); + q14u16 = vaddw_u8(q14u16, d18u8); + + d5u8 = vbsl_u8(d20u8, d5u8, d17u8); + + d7u8 = vqrshrn_n_u16(q14u16, 3); + + *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8); + *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8); + *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8); + } + return; +} + +void aom_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + uint8_t *s, *psrc; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + uint8x8_t d16u8, d17u8, d18u8; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + psrc = src - (pitch << 2); + for (i = 0; i < 1; i++) { + s = psrc + i * 8; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, + d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, + &d5u8); + + s -= (pitch * 6); + vst1_u8(s, d0u8); + s += pitch; + vst1_u8(s, d1u8); + s += pitch; + vst1_u8(s, d2u8); + s += pitch; + vst1_u8(s, d3u8); + s += pitch; + vst1_u8(s, d4u8); + s += pitch; + vst1_u8(s, d5u8); + } + return; +} + +void aom_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + uint8_t *s; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + uint8x8_t d16u8, d17u8, d18u8; + uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; + 
uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; + uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; + uint8x8x4_t d4Result; + uint8x8x2_t d2Result; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + for (i = 0; i < 1; i++) { + s = src + (i * (pitch << 3)) - 4; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8)); + d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8)); + d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8)); + d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8)); + + d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), + vreinterpret_u16_u32(d2tmp2.val[0])); + d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), + vreinterpret_u16_u32(d2tmp3.val[0])); + d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), + vreinterpret_u16_u32(d2tmp2.val[1])); + d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), + vreinterpret_u16_u32(d2tmp3.val[1])); + + d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), + vreinterpret_u8_u16(d2tmp5.val[0])); + d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), + vreinterpret_u8_u16(d2tmp5.val[1])); + d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), + vreinterpret_u8_u16(d2tmp7.val[0])); + d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), + vreinterpret_u8_u16(d2tmp7.val[1])); + + d3u8 = d2tmp8.val[0]; + d4u8 = d2tmp8.val[1]; + d5u8 = d2tmp9.val[0]; + d6u8 = d2tmp9.val[1]; + d7u8 = d2tmp10.val[0]; + d16u8 = d2tmp10.val[1]; + d17u8 = d2tmp11.val[0]; + d18u8 = d2tmp11.val[1]; + + mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, + d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, + &d5u8); + + 
d4Result.val[0] = d0u8; + d4Result.val[1] = d1u8; + d4Result.val[2] = d2u8; + d4Result.val[3] = d3u8; + + d2Result.val[0] = d4u8; + d2Result.val[1] = d5u8; + + s = src - 3; + vst4_lane_u8(s, d4Result, 0); + s += pitch; + vst4_lane_u8(s, d4Result, 1); + s += pitch; + vst4_lane_u8(s, d4Result, 2); + s += pitch; + vst4_lane_u8(s, d4Result, 3); + s += pitch; + vst4_lane_u8(s, d4Result, 4); + s += pitch; + vst4_lane_u8(s, d4Result, 5); + s += pitch; + vst4_lane_u8(s, d4Result, 6); + s += pitch; + vst4_lane_u8(s, d4Result, 7); + + s = src + 1; + vst2_lane_u8(s, d2Result, 0); + s += pitch; + vst2_lane_u8(s, d2Result, 1); + s += pitch; + vst2_lane_u8(s, d2Result, 2); + s += pitch; + vst2_lane_u8(s, d2Result, 3); + s += pitch; + vst2_lane_u8(s, d2Result, 4); + s += pitch; + vst2_lane_u8(s, d2Result, 5); + s += pitch; + vst2_lane_u8(s, d2Result, 6); + s += pitch; + vst2_lane_u8(s, d2Result, 7); + } + return; +} diff --git a/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm b/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm new file mode 100644 index 000000000..675928860 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm @@ -0,0 +1,638 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + + EXPORT |aom_lpf_horizontal_edge_8_neon| + EXPORT |aom_lpf_horizontal_edge_16_neon| + EXPORT |aom_lpf_vertical_16_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; void mb_lpf_horizontal_edge(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; r12 int count +|mb_lpf_horizontal_edge| PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] ; load thresh + +h_count + vld1.8 {d16[]}, [r2] ; load *blimit + vld1.8 {d17[]}, [r3] ; load *limit + vld1.8 {d18[]}, [r4] ; load *thresh + + sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines + + vld1.u8 {d0}, [r8@64], r1 ; p7 + vld1.u8 {d1}, [r8@64], r1 ; p6 + vld1.u8 {d2}, [r8@64], r1 ; p5 + vld1.u8 {d3}, [r8@64], r1 ; p4 + vld1.u8 {d4}, [r8@64], r1 ; p3 + vld1.u8 {d5}, [r8@64], r1 ; p2 + vld1.u8 {d6}, [r8@64], r1 ; p1 + vld1.u8 {d7}, [r8@64], r1 ; p0 + vld1.u8 {d8}, [r8@64], r1 ; q0 + vld1.u8 {d9}, [r8@64], r1 ; q1 + vld1.u8 {d10}, [r8@64], r1 ; q2 + vld1.u8 {d11}, [r8@64], r1 ; q3 + vld1.u8 {d12}, [r8@64], r1 ; q4 + vld1.u8 {d13}, [r8@64], r1 ; q5 + vld1.u8 {d14}, [r8@64], r1 ; q6 + vld1.u8 {d15}, [r8@64], r1 ; q7 + + bl aom_wide_mbfilter_neon + + tst r7, #1 + beq h_mbfilter + + ; flat && mask were not set for any of the channels. Just store the values + ; from filter. + sub r8, r0, r1, lsl #1 + + vst1.u8 {d25}, [r8@64], r1 ; store op1 + vst1.u8 {d24}, [r8@64], r1 ; store op0 + vst1.u8 {d23}, [r8@64], r1 ; store oq0 + vst1.u8 {d26}, [r8@64], r1 ; store oq1 + + b h_next + +h_mbfilter + tst r7, #2 + beq h_wide_mbfilter + + ; flat2 was not set for any of the channels. Just store the values from + ; mbfilter. 
+ sub r8, r0, r1, lsl #1 + sub r8, r8, r1 + + vst1.u8 {d18}, [r8@64], r1 ; store op2 + vst1.u8 {d19}, [r8@64], r1 ; store op1 + vst1.u8 {d20}, [r8@64], r1 ; store op0 + vst1.u8 {d21}, [r8@64], r1 ; store oq0 + vst1.u8 {d22}, [r8@64], r1 ; store oq1 + vst1.u8 {d23}, [r8@64], r1 ; store oq2 + + b h_next + +h_wide_mbfilter + sub r8, r0, r1, lsl #3 + add r8, r8, r1 + + vst1.u8 {d16}, [r8@64], r1 ; store op6 + vst1.u8 {d24}, [r8@64], r1 ; store op5 + vst1.u8 {d25}, [r8@64], r1 ; store op4 + vst1.u8 {d26}, [r8@64], r1 ; store op3 + vst1.u8 {d27}, [r8@64], r1 ; store op2 + vst1.u8 {d18}, [r8@64], r1 ; store op1 + vst1.u8 {d19}, [r8@64], r1 ; store op0 + vst1.u8 {d20}, [r8@64], r1 ; store oq0 + vst1.u8 {d21}, [r8@64], r1 ; store oq1 + vst1.u8 {d22}, [r8@64], r1 ; store oq2 + vst1.u8 {d23}, [r8@64], r1 ; store oq3 + vst1.u8 {d1}, [r8@64], r1 ; store oq4 + vst1.u8 {d2}, [r8@64], r1 ; store oq5 + vst1.u8 {d3}, [r8@64], r1 ; store oq6 + +h_next + add r0, r0, #8 + subs r12, r12, #1 + bne h_count + + vpop {d8-d15} + pop {r4-r8, pc} + + ENDP ; |mb_lpf_horizontal_edge| + +; void aom_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|aom_lpf_horizontal_edge_8_neon| PROC + mov r12, #1 + b mb_lpf_horizontal_edge + ENDP ; |aom_lpf_horizontal_edge_8_neon| + +; void aom_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|aom_lpf_horizontal_edge_16_neon| PROC + mov r12, #2 + b mb_lpf_horizontal_edge + ENDP ; |aom_lpf_horizontal_edge_16_neon| + +; void aom_lpf_vertical_16_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int 
p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|aom_lpf_vertical_16_neon| PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] ; load thresh + + vld1.8 {d16[]}, [r2] ; load *blimit + vld1.8 {d17[]}, [r3] ; load *limit + vld1.8 {d18[]}, [r4] ; load *thresh + + sub r8, r0, #8 + + vld1.8 {d0}, [r8@64], r1 + vld1.8 {d8}, [r0@64], r1 + vld1.8 {d1}, [r8@64], r1 + vld1.8 {d9}, [r0@64], r1 + vld1.8 {d2}, [r8@64], r1 + vld1.8 {d10}, [r0@64], r1 + vld1.8 {d3}, [r8@64], r1 + vld1.8 {d11}, [r0@64], r1 + vld1.8 {d4}, [r8@64], r1 + vld1.8 {d12}, [r0@64], r1 + vld1.8 {d5}, [r8@64], r1 + vld1.8 {d13}, [r0@64], r1 + vld1.8 {d6}, [r8@64], r1 + vld1.8 {d14}, [r0@64], r1 + vld1.8 {d7}, [r8@64], r1 + vld1.8 {d15}, [r0@64], r1 + + sub r0, r0, r1, lsl #3 + + vtrn.32 q0, q2 + vtrn.32 q1, q3 + vtrn.32 q4, q6 + vtrn.32 q5, q7 + + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + vtrn.8 d0, d1 + vtrn.8 d2, d3 + vtrn.8 d4, d5 + vtrn.8 d6, d7 + + vtrn.8 d8, d9 + vtrn.8 d10, d11 + vtrn.8 d12, d13 + vtrn.8 d14, d15 + + bl aom_wide_mbfilter_neon + + tst r7, #1 + beq v_mbfilter + + ; flat && mask were not set for any of the channels. Just store the values + ; from filter. + sub r8, r0, #2 + + vswp d23, d25 + + vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1 + vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1 + vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1 + vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1 + vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1 + vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1 + vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1 + vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1 + + b v_end + +v_mbfilter + tst r7, #2 + beq v_wide_mbfilter + + ; flat2 was not set for any of the channels. Just store the values from + ; mbfilter. 
+ sub r8, r0, #3 + + vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1 + vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1 + vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1 + vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1 + vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1 + vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1 + vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1 + vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1 + vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1 + vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1 + vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1 + vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1 + vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1 + vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1 + vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 + vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 + + b v_end + +v_wide_mbfilter + sub r8, r0, #8 + + vtrn.32 d0, d26 + vtrn.32 d16, d27 + vtrn.32 d24, d18 + vtrn.32 d25, d19 + + vtrn.16 d0, d24 + vtrn.16 d16, d25 + vtrn.16 d26, d18 + vtrn.16 d27, d19 + + vtrn.8 d0, d16 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + vtrn.8 d18, d19 + + vtrn.32 d20, d1 + vtrn.32 d21, d2 + vtrn.32 d22, d3 + vtrn.32 d23, d15 + + vtrn.16 d20, d22 + vtrn.16 d21, d23 + vtrn.16 d1, d3 + vtrn.16 d2, d15 + + vtrn.8 d20, d21 + vtrn.8 d22, d23 + vtrn.8 d1, d2 + vtrn.8 d3, d15 + + vst1.8 {d0}, [r8@64], r1 + vst1.8 {d20}, [r0@64], r1 + vst1.8 {d16}, [r8@64], r1 + vst1.8 {d21}, [r0@64], r1 + vst1.8 {d24}, [r8@64], r1 + vst1.8 {d22}, [r0@64], r1 + vst1.8 {d25}, [r8@64], r1 + vst1.8 {d23}, [r0@64], r1 + vst1.8 {d26}, [r8@64], r1 + vst1.8 {d1}, [r0@64], r1 + vst1.8 {d27}, [r8@64], r1 + vst1.8 {d2}, [r0@64], r1 + vst1.8 {d18}, [r8@64], r1 + vst1.8 {d3}, [r0@64], r1 + vst1.8 {d19}, [r8@64], r1 + vst1.8 {d15}, [r0@64], r1 + +v_end + vpop {d8-d15} + pop {r4-r8, pc} + + ENDP ; |aom_lpf_vertical_16_neon| + +; void aom_wide_mbfilter_neon(); +; This is a helper function for the loopfilters. The individual functions do the +; necessary load, transpose (if necessary) and store.
+; +; r0-r3 PRESERVE +; d16 blimit +; d17 limit +; d18 thresh +; d0 p7 +; d1 p6 +; d2 p5 +; d3 p4 +; d4 p3 +; d5 p2 +; d6 p1 +; d7 p0 +; d8 q0 +; d9 q1 +; d10 q2 +; d11 q3 +; d12 q4 +; d13 q5 +; d14 q6 +; d15 q7 +|aom_wide_mbfilter_neon| PROC + mov r7, #0 + + ; filter_mask + vabd.u8 d19, d4, d5 ; abs(p3 - p2) + vabd.u8 d20, d5, d6 ; abs(p2 - p1) + vabd.u8 d21, d6, d7 ; abs(p1 - p0) + vabd.u8 d22, d9, d8 ; abs(q1 - q0) + vabd.u8 d23, d10, d9 ; abs(q2 - q1) + vabd.u8 d24, d11, d10 ; abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1)) + vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0)) + vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2)) + vmax.u8 d19, d19, d20 + + vabd.u8 d24, d7, d8 ; abs(p0 - q0) + + vmax.u8 d19, d19, d23 + + vabd.u8 d23, d6, d9 ; a = abs(p1 - q1) + vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 + + ; (max(abs ()) <= limit) * -1 + vcge.u8 d19, d17, d19 + + ; flatmask4 + vabd.u8 d25, d7, d5 ; abs(p0 - p2) + vabd.u8 d26, d8, d10 ; abs(q0 - q2) + vabd.u8 d27, d4, d7 ; abs(p3 - p0) + vabd.u8 d28, d11, d8 ; abs(q3 - q0) + + ; only compare the largest value to the flat threshold (1, in d30) + vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2)) + vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0)) + vmax.u8 d25, d25, d26 + vmax.u8 d20, d20, d25 + + vshr.u8 d23, d23, #1 ; a = a / 2 + vqadd.u8 d24, d24, d23 ; a = b + a + + vmov.u8 d30, #1 + vcge.u8 d24, d16, d24 ; (a <= blimit) * -1 + + vcge.u8 d20, d30, d20 ; flat + + vand d19, d19, d24 ; mask + + ; hevmask + vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1 + vorr d21, d21, d22 ; hev + + vand d16, d20, d19 ; flat && mask + vmov r5, r6, d16 + + ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) + vabd.u8 d22, d3, d7 ; abs(p4 - p0) + vabd.u8 d23, d12, d8 ; abs(q4 - q0) + vabd.u8 d24, d7, d2 ; abs(p0 - p5) + vabd.u8 d25, d8, d13 ; abs(q0 - q5) + vabd.u8 d26, d1, d7 ; abs(p6 - p0) + vabd.u8 d27, 
d14, d8 ; abs(q6 - q0) + vabd.u8 d28, d0, d7 ; abs(p7 - p0) + vabd.u8 d29, d15, d8 ; abs(q7 - q0) + + ; only compare the largest value to thresh + vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0)) + vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5)) + vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0)) + vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0)) + + vmax.u8 d26, d22, d23 + vmax.u8 d27, d24, d25 + vmax.u8 d23, d26, d27 + + vcge.u8 d18, d30, d23 ; flat2 + + vmov.u8 d22, #0x80 + + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #1 ; Only do filter branch + + vand d17, d18, d16 ; flat2 && flat && mask + vmov r5, r6, d17 + + ; mbfilter() function + + ; filter() function + ; convert to signed + veor d23, d8, d22 ; qs0 + veor d24, d7, d22 ; ps0 + veor d25, d6, d22 ; ps1 + veor d26, d9, d22 ; qs1 + + vmov.u8 d27, #3 + + vsub.s8 d28, d23, d24 ; ( qs0 - ps0) + vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) + vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) + vand d29, d29, d21 ; filter &= hev + vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) + vmov.u8 d29, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d28, q15 + + vand d28, d28, d19 ; filter &= mask + + vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) + vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) + vshr.s8 d30, d30, #3 ; filter2 >>= 3 + vshr.s8 d29, d29, #3 ; filter1 >>= 3 + + + vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) + vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1) + + ; outer tap adjustments: ++filter1 >> 1 + vrshr.s8 d29, d29, #1 + vbic d29, d29, d21 ; filter &= ~hev + + vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) + vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) + + veor d24, d24, d22 ; *f_op0 = u^0x80 + veor d23, d23, d22 ; *f_oq0 = u^0x80 + veor d25, d25, d22 ; *f_op1 = u^0x80 + veor d26, d26, d22 ; *f_oq1 = u^0x80 + + tst r7, #1 + bxne lr + + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #2 ; Only do mbfilter branch + + ; mbfilter flat && 
mask branch + ; TODO(fgalligan): Can I decrease the cycles shifting to consecutive d's + ; and using vbit on the q's? + vmov.u8 d29, #2 + vaddl.u8 q15, d7, d8 ; op2 = p0 + q0 + vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3 + vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2 + vaddl.u8 q10, d4, d5 + vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2 + vaddl.u8 q14, d6, d9 + vqrshrn.u16 d18, q15, #3 ; r_op2 + + vsub.i16 q15, q10 + vaddl.u8 q10, d4, d6 + vadd.i16 q15, q14 + vaddl.u8 q14, d7, d10 + vqrshrn.u16 d19, q15, #3 ; r_op1 + + vsub.i16 q15, q10 + vadd.i16 q15, q14 + vaddl.u8 q14, d8, d11 + vqrshrn.u16 d20, q15, #3 ; r_op0 + + vsubw.u8 q15, d4 ; oq0 = op0 - p3 + vsubw.u8 q15, d7 ; oq0 -= p0 + vadd.i16 q15, q14 + vaddl.u8 q14, d9, d11 + vqrshrn.u16 d21, q15, #3 ; r_oq0 + + vsubw.u8 q15, d5 ; oq1 = oq0 - p2 + vsubw.u8 q15, d8 ; oq1 -= q0 + vadd.i16 q15, q14 + vaddl.u8 q14, d10, d11 + vqrshrn.u16 d22, q15, #3 ; r_oq1 + + vsubw.u8 q15, d6 ; oq2 = oq1 - p1 + vsubw.u8 q15, d9 ; oq2 -= q1 + vadd.i16 q15, q14 + vqrshrn.u16 d27, q15, #3 ; r_oq2 + + ; Filter does not set op2 or oq2, so use p2 and q2. 
+ vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask) + vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask) + vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask) + vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask) + vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask) + + vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask) + vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask) + + tst r7, #2 + bxne lr + + ; wide_mbfilter flat2 && flat && mask branch + vmov.u8 d16, #7 + vaddl.u8 q15, d7, d8 ; op6 = p0 + q0 + vaddl.u8 q12, d2, d3 + vaddl.u8 q13, d4, d5 + vaddl.u8 q14, d1, d6 + vmlal.u8 q15, d0, d16 ; op6 += p7 * 7 + vadd.i16 q12, q13 + vadd.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vadd.i16 q15, q12 + vaddl.u8 q12, d0, d1 + vaddw.u8 q15, d1 + vaddl.u8 q13, d0, d2 + vadd.i16 q14, q15, q14 + vqrshrn.u16 d16, q15, #4 ; w_op6 + + vsub.i16 q15, q14, q12 + vaddl.u8 q14, d3, d10 + vqrshrn.u16 d24, q15, #4 ; w_op5 + + vsub.i16 q15, q13 + vaddl.u8 q13, d0, d3 + vadd.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vqrshrn.u16 d25, q15, #4 ; w_op4 + + vadd.i16 q15, q14 + vaddl.u8 q14, d0, d4 + vsub.i16 q15, q13 + vsub.i16 q14, q15, q14 + vqrshrn.u16 d26, q15, #4 ; w_op3 + + vaddw.u8 q15, q14, d5 ; op2 += p2 + vaddl.u8 q14, d0, d5 + vaddw.u8 q15, d12 ; op2 += q4 + vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m) + vqrshrn.u16 d27, q15, #4 ; w_op2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d6 + vaddw.u8 q15, d6 ; op1 += p1 + vaddw.u8 q15, d13 ; op1 += q5 + vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m) + vqrshrn.u16 d18, q15, #4 ; w_op1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d7 + vaddw.u8 q15, d7 ; op0 += p0 + vaddw.u8 q15, d14 ; op0 += q6 + vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m) + vqrshrn.u16 d19, q15, #4 ; w_op0 + + vsub.i16 q15, q14 + vaddl.u8 q14, d1, d8 + vaddw.u8 q15, d8 ; oq0 += q0 + vaddw.u8 q15, d15 ; oq0 += q7 + vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m) + vqrshrn.u16 d20, q15, #4 ; w_oq0 + + vsub.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vaddw.u8 q15, d9 ; oq1 += q1 
+ vaddl.u8 q4, d10, d15 + vaddw.u8 q15, d15 ; oq1 += q7 + vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m) + vqrshrn.u16 d21, q15, #4 ; w_oq1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d3, d10 + vadd.i16 q15, q4 + vaddl.u8 q4, d11, d15 + vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m) + vqrshrn.u16 d22, q15, #4 ; w_oq2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vadd.i16 q15, q4 + vaddl.u8 q4, d12, d15 + vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m) + vqrshrn.u16 d23, q15, #4 ; w_oq3 + + vsub.i16 q15, q14 + vaddl.u8 q14, d5, d12 + vadd.i16 q15, q4 + vaddl.u8 q4, d13, d15 + vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m) + vqrshrn.u16 d1, q15, #4 ; w_oq4 + + vsub.i16 q15, q14 + vaddl.u8 q14, d6, d13 + vadd.i16 q15, q4 + vaddl.u8 q4, d14, d15 + vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m) + vqrshrn.u16 d2, q15, #4 ; w_oq5 + + vsub.i16 q15, q14 + vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m) + vadd.i16 q15, q4 + vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m) + vqrshrn.u16 d3, q15, #4 ; w_oq6 + vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m) + vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m) + vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m) + + bx lr + ENDP ; |aom_wide_mbfilter_neon| + + END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c new file mode 100644 index 000000000..c90d6bfde --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_neon.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> + +#include "./aom_dsp_rtcd.h" +#include "./aom_config.h" +#include "aom/aom_integer.h" + +void aom_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0); + aom_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1); +} + +#if HAVE_NEON_ASM +void aom_lpf_horizontal_8_dual_neon( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { + aom_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0); + aom_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + aom_lpf_vertical_16_neon(s, p, blimit, limit, thresh); + aom_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh); +} +#endif // HAVE_NEON_ASM diff --git a/third_party/aom/aom_dsp/arm/sad4d_neon.c b/third_party/aom/aom_dsp/arm/sad4d_neon.c new file mode 100644 index 000000000..a1eeaf4b7 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/sad4d_neon.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" + +static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { + const uint32x4_t vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} + +// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16, +// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo +// and vec_sum_ref_hi. 
+static void sad_neon_64(const uint8x16_t vec_src_00, + const uint8x16_t vec_src_16, + const uint8x16_t vec_src_32, + const uint8x16_t vec_src_48, const uint8_t *ref, + uint16x8_t *vec_sum_ref_lo, + uint16x8_t *vec_sum_ref_hi) { + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); + const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); + + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32), + vget_low_u8(vec_ref_32)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32), + vget_high_u8(vec_ref_32)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48), + vget_low_u8(vec_ref_48)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48), + vget_high_u8(vec_ref_48)); +} + +// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16, +// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi. 
+static void sad_neon_32(const uint8x16_t vec_src_00, + const uint8x16_t vec_src_16, const uint8_t *ref, + uint16x8_t *vec_sum_ref_lo, + uint16x8_t *vec_sum_ref_hi) { + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); +} + +void aom_sad64x64x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + int i; + uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); + const uint8_t *ref0, *ref1, *ref2, *ref3; + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + for (i = 0; i < 64; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_src_32 = vld1q_u8(src + 32); + const uint8x16_t vec_src_48 = vld1q_u8(src + 48); + + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0, + &vec_sum_ref0_lo, &vec_sum_ref0_hi); + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1, + &vec_sum_ref1_lo, &vec_sum_ref1_hi); + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2, + &vec_sum_ref2_lo, &vec_sum_ref2_hi); + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3, + &vec_sum_ref3_lo, &vec_sum_ref3_hi); + + src += src_stride; + ref0 += 
ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + + res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); + res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); + res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); + res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +} + +void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + int i; + uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); + const uint8_t *ref0, *ref1, *ref2, *ref3; + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + for (i = 0; i < 32; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + + sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo, + &vec_sum_ref0_hi); + sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo, + &vec_sum_ref1_hi); + sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo, + &vec_sum_ref2_hi); + sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo, + &vec_sum_ref3_hi); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + + res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); + res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); + res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); + res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +} + +void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + 
int i; + uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); + const uint8_t *ref0, *ref1, *ref2, *ref3; + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + for (i = 0; i < 16; ++i) { + const uint8x16_t vec_src = vld1q_u8(src); + const uint8x16_t vec_ref0 = vld1q_u8(ref0); + const uint8x16_t vec_ref1 = vld1q_u8(ref1); + const uint8x16_t vec_ref2 = vld1q_u8(ref2); + const uint8x16_t vec_ref3 = vld1q_u8(ref3); + + vec_sum_ref0_lo = + vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0)); + vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref0)); + vec_sum_ref1_lo = + vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1)); + vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref1)); + vec_sum_ref2_lo = + vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2)); + vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref2)); + vec_sum_ref3_lo = + vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3)); + vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref3)); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + + res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); + res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); + res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); + res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +} diff --git a/third_party/aom/aom_dsp/arm/sad_media.asm 
b/third_party/aom/aom_dsp/arm/sad_media.asm new file mode 100644 index 000000000..49ddb6764 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/sad_media.asm @@ -0,0 +1,98 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + + EXPORT |aom_sad16x16_media| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 const unsigned char *src_ptr +; r1 int src_stride +; r2 const unsigned char *ref_ptr +; r3 int ref_stride +|aom_sad16x16_media| PROC + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + pld [r0, r1, lsl #1] + pld [r2, r3, lsl #1] + + mov r4, #0 ; sad = 0; + mov r5, #8 ; loop count + +loop + ; 1st row + ldr r6, [r0, #0x0] ; load 4 src pixels (1A) + ldr r8, [r2, #0x0] ; load 4 ref pixels (1A) + ldr r7, [r0, #0x4] ; load 4 src pixels (1A) + ldr r9, [r2, #0x4] ; load 4 ref pixels (1A) + ldr r10, [r0, #0x8] ; load 4 src pixels (1B) + ldr r11, [r0, #0xC] ; load 4 src pixels (1B) + + usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels + usad8 r8, r7, r9 ; calculate sad for 4 pixels + + ldr r12, [r2, #0x8] ; load 4 ref pixels (1B) + ldr lr, [r2, #0xC] ; load 4 ref pixels (1B) + + add r0, r0, r1 ; set src pointer to next row + add r2, r2, r3 ; set dst pointer to next row + + pld [r0, r1, lsl #1] + pld [r2, r3, lsl #1] + + usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels + usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels + + ldr r6, [r0, #0x0] ; load 4 src pixels (2A) + ldr r7, [r0, #0x4] ; load 4 src pixels (2A) + add r4, r4, r8 ; add 
partial sad values + + ; 2nd row + ldr r8, [r2, #0x0] ; load 4 ref pixels (2A) + ldr r9, [r2, #0x4] ; load 4 ref pixels (2A) + ldr r10, [r0, #0x8] ; load 4 src pixels (2B) + ldr r11, [r0, #0xC] ; load 4 src pixels (2B) + + usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels + usad8 r8, r7, r9 ; calculate sad for 4 pixels + + ldr r12, [r2, #0x8] ; load 4 ref pixels (2B) + ldr lr, [r2, #0xC] ; load 4 ref pixels (2B) + + add r0, r0, r1 ; set src pointer to next row + add r2, r2, r3 ; set dst pointer to next row + + usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels + usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels + + pld [r0, r1, lsl #1] + pld [r2, r3, lsl #1] + + subs r5, r5, #1 ; decrement loop counter + add r4, r4, r8 ; add partial sad values + + bne loop + + mov r0, r4 ; return sad + ldmfd sp!, {r4-r12, pc} + + ENDP + + END + diff --git a/third_party/aom/aom_dsp/arm/sad_neon.c b/third_party/aom/aom_dsp/arm/sad_neon.c new file mode 100644 index 000000000..2f452f55b --- /dev/null +++ b/third_party/aom/aom_dsp/arm/sad_neon.c @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> + +#include "./aom_config.h" + +#include "aom/aom_integer.h" + +unsigned int aom_sad8x16_neon(unsigned char *src_ptr, int src_stride, + unsigned char *ref_ptr, int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + + for (i = 0; i < 15; i++) { + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, d0, d8); + } + + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} + +unsigned int aom_sad4x4_neon(unsigned char *src_ptr, int src_stride, + unsigned char *ref_ptr, int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x2_t d1; + uint64x1_t d3; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + + for (i = 0; i < 3; i++) { + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, d0, d8); + } + + d1 = vpaddl_u16(vget_low_u16(q12)); + d3 = vpaddl_u32(d1); + + return vget_lane_u32(vreinterpret_u32_u64(d3), 0); +} + +unsigned int aom_sad16x8_neon(unsigned char *src_ptr, int src_stride, + unsigned char *ref_ptr, int ref_stride) { + uint8x16_t q0, q4; + uint16x8_t q12, q13; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); + + for (i = 0; i < 7; i++) { + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, vget_low_u8(q0), 
vget_low_u8(q4)); + q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); + } + + q12 = vaddq_u16(q12, q13); + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} + +static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { + const uint32x4_t vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} +static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { + const uint32x4_t a = vpaddlq_u16(vec_16x8); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} + +unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + for (i = 0; i < 64; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_src_32 = vld1q_u8(src + 32); + const uint8x16_t vec_src_48 = vld1q_u8(src + 48); + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); + const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); + src += src_stride; + ref += ref_stride; + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), + 
vget_high_u8(vec_ref_00)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32), + vget_low_u8(vec_ref_32)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), + vget_high_u8(vec_ref_32)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), + vget_low_u8(vec_ref_48)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), + vget_high_u8(vec_ref_48)); + } + return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); +} + +unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + + for (i = 0; i < 32; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + src += src_stride; + ref += ref_stride; + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); + } + return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); +} + +unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + + for (i = 0; i < 16; ++i) { + const uint8x16_t vec_src = vld1q_u8(src); + const uint8x16_t vec_ref = vld1q_u8(ref); + src += src_stride; + ref += ref_stride; + vec_accum_lo = + 
;
;  Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
;  This source code is subject to the terms of the BSD 2 Clause License and
;  the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
;  was not distributed with this source code in the LICENSE file, you can
;  obtain it at www.aomedia.org/license/software. If the Alliance for Open
;  Media Patent License 1.0 was not distributed with this source code in the
;  PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;
;  Helpers that spill NEON registers d8-d15 to a caller-supplied buffer and
;  reload them. NOTE(review): presumably d8-d15 are chosen because they are
;  the callee-saved NEON registers under the AAPCS -- confirm.
;


    EXPORT  |aom_push_neon|
    EXPORT  |aom_pop_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    pointer to the save area; the two stores below write 8 doubleword
;       registers (64 bytes) and leave r0 advanced past them (writeback '!')
|aom_push_neon| PROC
    vst1.i64            {d8, d9, d10, d11}, [r0]!       ; store d8-d11, r0 += 32
    vst1.i64            {d12, d13, d14, d15}, [r0]!     ; store d12-d15, r0 += 32
    bx              lr

    ENDP

; r0    pointer to a save area in the layout written by aom_push_neon;
;       64 bytes are read and r0 is left advanced past them
|aom_pop_neon| PROC
    vld1.i64            {d8, d9, d10, d11}, [r0]!       ; reload d8-d11, r0 += 32
    vld1.i64            {d12, d13, d14, d15}, [r0]!     ; reload d12-d15, r0 += 32
    bx              lr

    ENDP

    END
aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 8, 8, 8, + VFilter); + + return aom_variance8x8_media(second_pass, 8, dst_ptr, dst_pixels_per_line, + sse); +} + +unsigned int aom_sub_pixel_variance16x16_media( + const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, + const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { + uint16_t first_pass[36 * 16]; + uint8_t second_pass[20 * 16]; + const int16_t *HFilter, *VFilter; + unsigned int var; + + if (xoffset == 4 && yoffset == 0) { + var = aom_variance_halfpixvar16x16_h_media( + src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + } else if (xoffset == 0 && yoffset == 4) { + var = aom_variance_halfpixvar16x16_v_media( + src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + } else if (xoffset == 4 && yoffset == 4) { + var = aom_variance_halfpixvar16x16_hv_media( + src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + } else { + HFilter = bilinear_filters_media[xoffset]; + VFilter = bilinear_filters_media[yoffset]; + + aom_filter_block2d_bil_first_pass_media( + src_ptr, first_pass, src_pixels_per_line, 17, 16, HFilter); + aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, 16, + 16, VFilter); + + var = aom_variance16x16_media(second_pass, 16, dst_ptr, dst_pixels_per_line, + sse); + } + return var; +} +#endif // HAVE_MEDIA diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c new file mode 100644 index 000000000..064b72d6f --- /dev/null +++ b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include "./aom_dsp_rtcd.h"
#include "./aom_config.h"

#include "aom_ports/mem.h"
#include "aom/aom_integer.h"

#include "aom_dsp/variance.h"

/* Two-tap bilinear kernels selected by xoffset / yoffset (0..7); every pair
 * sums to 128. */
static const uint8_t bilinear_filters[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
};

/* Applies a two-tap filter to an 8-pixel-wide block.
 *
 * For each of output_height rows, computes
 *   out[x] = round((src[x] * filter[0] + src[x + pixel_step] * filter[1])
 *                  >> FILTER_BITS)
 * for the 8 pixels of the row. pixel_step is 1 for a horizontal pass and the
 * source stride for a vertical pass. output_width is used only as the
 * distance between output rows; exactly 8 bytes are written per row. */
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      unsigned int output_width,
                                      const uint8_t *filter) {
  const uint8x8_t f0 = vmov_n_u8(filter[0]);
  const uint8x8_t f1 = vmov_n_u8(filter[1]);
  unsigned int i;
  for (i = 0; i < output_height; ++i) {
    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
    const uint16x8_t a = vmull_u8(src_0, f0);
    const uint16x8_t b = vmlal_u8(a, src_1, f1);
    /* Rounding shift, narrowing back to 8 bits. */
    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
    vst1_u8(&output_ptr[0], out);
    // Next row...
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}

/* Same filter as var_filter_block2d_bil_w8, for blocks whose width is a
 * multiple of 16: each row is processed in 16-pixel groups, so a width that
 * is not a multiple of 16 would be rounded up to the next group. */
static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                       uint8_t *output_ptr,
                                       unsigned int src_pixels_per_line,
                                       int pixel_step,
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const uint8_t *filter) {
  const uint8x8_t f0 = vmov_n_u8(filter[0]);
  const uint8x8_t f1 = vmov_n_u8(filter[1]);
  unsigned int i, j;
  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; j += 16) {
      const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
      const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
      const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
      const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
      const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
      const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
      const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
      const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
      vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
    }
    // Next row...
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}

/* The four entry points below share one scheme: a horizontal pass (step 1)
 * produces N + 1 filtered rows into fdata3, a vertical pass (step = block
 * width) reduces them to N rows in temp2, and the matching
 * aom_varianceNxN_neon routine compares temp2 against dst. *sse receives the
 * sum of squared errors and the variance routine's result is returned. */

unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
                                            int xoffset, int yoffset,
                                            const uint8_t *dst, int dst_stride,
                                            unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);

  var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
                            bilinear_filters[xoffset]);
  var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
                            bilinear_filters[yoffset]);
  return aom_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
}

unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src,
                                              int src_stride, int xoffset,
                                              int yoffset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
                             bilinear_filters[xoffset]);
  var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
                             bilinear_filters[yoffset]);
  return aom_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
}

unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src,
                                              int src_stride, int xoffset,
                                              int yoffset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
                             bilinear_filters[xoffset]);
  var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
                             bilinear_filters[yoffset]);
  return aom_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
}

unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src,
                                              int src_stride, int xoffset,
                                              int yoffset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
                             bilinear_filters[xoffset]);
  var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
                             bilinear_filters[yoffset]);
  return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
}
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <stddef.h>

#include "./aom_config.h"
#include "aom/aom_integer.h"

/* Computes diff = src - pred over a rows x cols block.
 *
 * All three strides are in elements of their own array. The code path is
 * picked from cols:
 *   cols > 16 : 32 columns per step, so cols is expected to be a multiple
 *               of 32 (a smaller remainder would still be processed as 32);
 *   cols > 8  : exactly 16 columns per row;
 *   cols > 4  : exactly 8 columns per row;
 *   otherwise : scalar loop over cols columns.
 * The widening subtract is done on unsigned lanes and wraps modulo 2^16;
 * reinterpreting the lanes as int16_t yields the signed difference. */
void aom_subtract_block_neon(int rows, int cols, int16_t *diff,
                             ptrdiff_t diff_stride, const uint8_t *src,
                             ptrdiff_t src_stride, const uint8_t *pred,
                             ptrdiff_t pred_stride) {
  int r, c;

  if (cols > 16) {
    for (r = 0; r < rows; ++r) {
      for (c = 0; c < cols; c += 32) {
        const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
        const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
        const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
        const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
        const uint16x8_t v_diff_lo_00 =
            vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00));
        const uint16x8_t v_diff_hi_00 =
            vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00));
        const uint16x8_t v_diff_lo_16 =
            vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16));
        const uint16x8_t v_diff_hi_16 =
            vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16));
        vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
        vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
        vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
        vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
      }
      diff += diff_stride;
      pred += pred_stride;
      src += src_stride;
    }
  } else if (cols > 8) {
    for (r = 0; r < rows; ++r) {
      const uint8x16_t v_src = vld1q_u8(&src[0]);
      const uint8x16_t v_pred = vld1q_u8(&pred[0]);
      const uint16x8_t v_diff_lo =
          vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred));
      const uint16x8_t v_diff_hi =
          vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred));
      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
      vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
      diff += diff_stride;
      pred += pred_stride;
      src += src_stride;
    }
  } else if (cols > 4) {
    for (r = 0; r < rows; ++r) {
      const uint8x8_t v_src = vld1_u8(&src[0]);
      const uint8x8_t v_pred = vld1_u8(&pred[0]);
      const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
      diff += diff_stride;
      pred += pred_stride;
      src += src_stride;
    }
  } else {
    for (r = 0; r < rows; ++r) {
      for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c];

      diff += diff_stride;
      pred += pred_stride;
      src += src_stride;
    }
  }
}
;
;  Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
;  This source code is subject to the terms of the BSD 2 Clause License and
;  the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
;  was not distributed with this source code in the LICENSE file, you can
;  obtain it at www.aomedia.org/license/software. If the Alliance for Open
;  Media Patent License 1.0 was not distributed with this source code in the
;  PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;
;  16x16 variance against a source block averaged with its right-hand
;  neighbour: each source pixel is (src[x] + src[x + 1] + 1) >> 1 before it
;  is compared with the reference. Four pixels are handled per word, four
;  words per row, sixteen rows.
;


    EXPORT  |aom_variance_halfpixvar16x16_h_media|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int  recon_stride
; stack unsigned int *sse
;
; Register use inside the loop:
;   r8  running sum of differences      r11 running sum of squared differences
;   r10 0x80808080                      r12 remaining rows
;   lr  zero                            r4-r7 scratch
|aom_variance_halfpixvar16x16_h_media| PROC

    stmfd   sp!, {r4-r12, lr}

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r8, #0              ; initialize sum = 0
    ldr     r10, c80808080
    mov     r11, #0             ; initialize sse = 0
    mov     r12, #16            ; set loop counter to 16 (=block height)
    mov     lr, #0              ; constant zero
loop
    ; 1st 4 pixels
    ldr     r4, [r0, #0]        ; load 4 src pixels
    ldr     r6, [r0, #1]        ; load 4 src pixels with 1 byte offset
    ldr     r5, [r2, #0]        ; load 4 ref pixels

    ; bilinear interpolation: x = (a + b + 1) >> 1 in each byte lane.
    ; a - ~b = a + b - 255; halving that and flipping the top bit of every
    ; byte (eor with 0x80808080) gives the rounded average.
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10

    usub8   r6, r4, r5          ; calculate difference
    pld     [r0, r1, lsl #1]
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    pld     [r2, r3, lsl #1]
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels
    ; calculate total sum
    adds    r8, r8, r4          ; add positive differences to sum
    subs    r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
    ldr     r4, [r0, #4]        ; load 4 src pixels
    ldr     r6, [r0, #5]        ; load 4 src pixels with 1 byte offset
    ldr     r5, [r2, #4]        ; load 4 ref pixels

    ; bilinear interpolation: x = (a + b + 1) >> 1
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10

    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 3rd 4 pixels
    ldr     r4, [r0, #8]        ; load 4 src pixels
    ldr     r6, [r0, #9]        ; load 4 src pixels with 1 byte offset
    ldr     r5, [r2, #8]        ; load 4 ref pixels

    ; bilinear interpolation: x = (a + b + 1) >> 1
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10

    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 4th 4 pixels
    ldr     r4, [r0, #12]       ; load 4 src pixels
    ldr     r6, [r0, #13]       ; load 4 src pixels with 1 byte offset
    ldr     r5, [r2, #12]       ; load 4 ref pixels

    ; bilinear interpolation: x = (a + b + 1) >> 1
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10

    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set ref_ptr to next row
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    subs    r12, r12, #1

    bne     loop

    ; return stuff
    ldr     r6, [sp, #40]       ; get address of sse (above the 10 saved words)
    mul     r0, r8, r8          ; sum * sum
    str     r11, [r6]           ; store sse
    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))

    ldmfd   sp!, {r4-r12, pc}

    ENDP

c80808080
    DCD     0x80808080

    END
;
;  Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
;  This source code is subject to the terms of the BSD 2 Clause License and
;  the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
;  was not distributed with this source code in the LICENSE file, you can
;  obtain it at www.aomedia.org/license/software. If the Alliance for Open
;  Media Patent License 1.0 was not distributed with this source code in the
;  PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;
;  16x16 variance against a source block interpolated at the half-pixel
;  position in both directions: rows N and N+1 are each averaged
;  horizontally, then the two results are averaged vertically, with
;  rounding at every step.
;


    EXPORT  |aom_variance_halfpixvar16x16_hv_media|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int  recon_stride
; stack unsigned int *sse
;
; Register use inside the loop:
;   r8  running sum of differences      r11 running sum of squared differences
;   r9  src_ptr + source_stride         r10 0x80808080
;   r12 remaining rows                  lr  zero
|aom_variance_halfpixvar16x16_hv_media| PROC

    stmfd   sp!, {r4-r12, lr}

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r8, #0              ; initialize sum = 0
    ldr     r10, c80808080
    mov     r11, #0             ; initialize sse = 0
    mov     r12, #16            ; set loop counter to 16 (=block height)
    mov     lr, #0              ; constant zero
loop
    add     r9, r0, r1          ; pointer to pixels on the next row
    ; 1st 4 pixels
    ldr     r4, [r0, #0]        ; load source pixels a, row N
    ldr     r6, [r0, #1]        ; load source pixels b, row N
    ldr     r5, [r9, #0]        ; load source pixels c, row N+1
    ldr     r7, [r9, #1]        ; load source pixels d, row N+1

    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
    ; (a - ~b = a + b - 255; halve, then flip the top bit of every byte)
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10
    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
    mvn     r7, r7
    uhsub8  r5, r5, r7
    eor     r5, r5, r10
    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
    mvn     r5, r5
    uhsub8  r4, r4, r5
    ldr     r5, [r2, #0]        ; load 4 ref pixels
    eor     r4, r4, r10

    usub8   r6, r4, r5          ; calculate difference
    pld     [r0, r1, lsl #1]
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    pld     [r2, r3, lsl #1]
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels
    ; calculate total sum
    adds    r8, r8, r4          ; add positive differences to sum
    subs    r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
    ldr     r4, [r0, #4]        ; load source pixels a, row N
    ldr     r6, [r0, #5]        ; load source pixels b, row N
    ldr     r5, [r9, #4]        ; load source pixels c, row N+1

    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    ldr     r7, [r9, #5]        ; load source pixels d, row N+1

    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10
    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
    mvn     r7, r7
    uhsub8  r5, r5, r7
    eor     r5, r5, r10
    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
    mvn     r5, r5
    uhsub8  r4, r4, r5
    ldr     r5, [r2, #4]        ; load 4 ref pixels
    eor     r4, r4, r10

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 3rd 4 pixels
    ldr     r4, [r0, #8]        ; load source pixels a, row N
    ldr     r6, [r0, #9]        ; load source pixels b, row N
    ldr     r5, [r9, #8]        ; load source pixels c, row N+1

    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    ldr     r7, [r9, #9]        ; load source pixels d, row N+1

    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10
    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
    mvn     r7, r7
    uhsub8  r5, r5, r7
    eor     r5, r5, r10
    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
    mvn     r5, r5
    uhsub8  r4, r4, r5
    ldr     r5, [r2, #8]        ; load 4 ref pixels
    eor     r4, r4, r10

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 4th 4 pixels
    ldr     r4, [r0, #12]       ; load source pixels a, row N
    ldr     r6, [r0, #13]       ; load source pixels b, row N
    ldr     r5, [r9, #12]       ; load source pixels c, row N+1
    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
    ldr     r7, [r9, #13]       ; load source pixels d, row N+1

    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10
    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
    mvn     r7, r7
    uhsub8  r5, r5, r7
    eor     r5, r5, r10
    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
    mvn     r5, r5
    uhsub8  r4, r4, r5
    ldr     r5, [r2, #12]       ; load 4 ref pixels
    eor     r4, r4, r10

    usub8   r6, r4, r5          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set ref_ptr to next row
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
    subs    r12, r12, #1
    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    bne     loop

    ; return stuff
    ldr     r6, [sp, #40]       ; get address of sse (above the 10 saved words)
    mul     r0, r8, r8          ; sum * sum
    str     r11, [r6]           ; store sse
    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))

    ldmfd   sp!, {r4-r12, pc}

    ENDP

c80808080
    DCD     0x80808080

    END
;
;  Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
;  This source code is subject to the terms of the BSD 2 Clause License and
;  the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
;  was not distributed with this source code in the LICENSE file, you can
;  obtain it at www.aomedia.org/license/software. If the Alliance for Open
;  Media Patent License 1.0 was not distributed with this source code in the
;  PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;
;  16x16 variance against a source block averaged with the row below it:
;  each source pixel is (src[x] + src[x + stride] + 1) >> 1 before it is
;  compared with the reference.
;


    EXPORT  |aom_variance_halfpixvar16x16_v_media|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int  recon_stride
; stack unsigned int *sse
;
; Register use inside the loop:
;   r8  running sum of differences      r11 running sum of squared differences
;   r9  src_ptr + source_stride         r10 0x80808080
;   r12 remaining rows                  lr  zero
|aom_variance_halfpixvar16x16_v_media| PROC

    stmfd   sp!, {r4-r12, lr}

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r8, #0              ; initialize sum = 0
    ldr     r10, c80808080
    mov     r11, #0             ; initialize sse = 0
    mov     r12, #16            ; set loop counter to 16 (=block height)
    mov     lr, #0              ; constant zero
loop
    add     r9, r0, r1          ; set src pointer to next row
    ; 1st 4 pixels
    ldr     r4, [r0, #0]        ; load 4 src pixels
    ldr     r6, [r9, #0]        ; load 4 src pixels from next row
    ldr     r5, [r2, #0]        ; load 4 ref pixels

    ; bilinear interpolation: x = (a + b + 1) >> 1 in each byte lane.
    ; a - ~b = a + b - 255; halving that and flipping the top bit of every
    ; byte (eor with 0x80808080) gives the rounded average.
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10

    usub8   r6, r4, r5          ; calculate difference
    pld     [r0, r1, lsl #1]
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    pld     [r2, r3, lsl #1]
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels
    ; calculate total sum
    adds    r8, r8, r4          ; add positive differences to sum
    subs    r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
    ldr     r4, [r0, #4]        ; load 4 src pixels
    ldr     r6, [r9, #4]        ; load 4 src pixels from next row
    ldr     r5, [r2, #4]        ; load 4 ref pixels

    ; bilinear interpolation: x = (a + b + 1) >> 1
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10

    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 3rd 4 pixels
    ldr     r4, [r0, #8]        ; load 4 src pixels
    ldr     r6, [r9, #8]        ; load 4 src pixels from next row
    ldr     r5, [r2, #8]        ; load 4 ref pixels

    ; bilinear interpolation: x = (a + b + 1) >> 1
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10

    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 4th 4 pixels
    ldr     r4, [r0, #12]       ; load 4 src pixels
    ldr     r6, [r9, #12]       ; load 4 src pixels from next row
    ldr     r5, [r2, #12]       ; load 4 ref pixels

    ; bilinear interpolation: x = (a + b + 1) >> 1
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10

    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set ref_ptr to next row
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)


    subs    r12, r12, #1

    bne     loop

    ; return stuff
    ldr     r6, [sp, #40]       ; get address of sse (above the 10 saved words)
    mul     r0, r8, r8          ; sum * sum
    str     r11, [r6]           ; store sse
    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))

    ldmfd   sp!, {r4-r12, pc}

    ENDP

c80808080
    DCD     0x80808080

    END
If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + + EXPORT |aom_variance16x16_media| + EXPORT |aom_variance8x8_media| + EXPORT |aom_mse16x16_media| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|aom_variance16x16_media| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + +loop16x16 + ; 1st 4 pixels + ldr r4, [r0, #0] ; load 4 src pixels + ldr r5, [r2, #0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load 4 src pixels + ldr r5, [r2, #4] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands 
+ sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load 4 src pixels + ldr r5, [r2, #8] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, #12] ; load 4 src pixels + ldr r5, [r2, #12] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r9, lr ; select bytes with negative difference + + ; 
calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + + subs r12, r12, #1 + + bne loop16x16 + + ; return stuff + ldr r6, [sp, #40] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|aom_variance8x8_media| PROC + + push {r4-r10, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r12, #8 ; set loop counter to 8 (=block height) + mov r4, #0 ; initialize sum = 0 + mov r5, #0 ; initialize sse = 0 + +loop8x8 + ; 1st 4 pixels + ldr r6, [r0, #0x0] ; load 4 src pixels + ldr r7, [r2, #0x0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r8, r6, r7 ; calculate difference + pld [r0, r1, lsl #1] + sel r10, r8, lr ; select bytes with positive difference + usub8 r9, r7, r6 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r6, r10, lr ; calculate sum of positive differences + usad8 r7, r8, lr ; calculate sum of negative differences + orr r8, r8, r10 ; differences of all 4 pixels + ; calculate total sum + add r4, r4, r6 ; add positive differences to sum + sub r4, r4, r7 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r7, r8 ; byte 
(two pixels) to halfwords + uxtb16 r10, r8, ror #8 ; another two pixels to halfwords + smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r6, [r0, #0x4] ; load 4 src pixels + ldr r7, [r2, #0x4] ; load 4 ref pixels + smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r6, r7 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r10, r8, lr ; select bytes with positive difference + usub8 r9, r7, r6 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r6, r10, lr ; calculate sum of positive differences + usad8 r7, r8, lr ; calculate sum of negative differences + orr r8, r8, r10 ; differences of all 4 pixels + + ; calculate total sum + add r4, r4, r6 ; add positive differences to sum + sub r4, r4, r7 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r7, r8 ; byte (two pixels) to halfwords + uxtb16 r10, r8, ror #8 ; another two pixels to halfwords + smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) + subs r12, r12, #1 ; next row + smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) + + bne loop8x8 + + ; return stuff + ldr r8, [sp, #32] ; get address of sse + mul r1, r4, r4 ; sum * sum + str r5, [r8] ; store sse + sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6)) + + pop {r4-r10, pc} + + ENDP + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +; +;note: Based on aom_variance16x16_media. In this function, sum is never used. +; So, we can remove this part of calculation. 
+
+; Sum of squared differences between a 16x16 source and reference block.
+; Each row is handled as four 4-pixel groups: usub8/sel in both operand orders
+; leave the per-byte absolute difference, uxtb16 widens the bytes to halfwords
+; and smlad accumulates their squares into r4.
+; The usad8 partial sums of the variance routines are omitted here: their
+; results were overwritten before ever being read.
+|aom_mse16x16_media| PROC
+
+    push    {r4-r9, lr}         ; 7 words pushed, so sse is at [sp, #28]
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     r4, #0              ; initialize sse = 0
+
+loopmse
+    ; 1st 4 pixels
+    ldr     r5, [r0, #0x0]      ; load 4 src pixels
+    ldr     r6, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r5, r6          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0x4]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    orr     r8, r8, r7          ; differences of all 4 pixels
+    ldr     r5, [r0, #0x8]      ; load 4 src pixels
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0xc]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    subs    r12, r12, #1        ; next row
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    bne     loopmse             ; flags still come from the subs above
+
+    ; return stuff
+    ldr     r1, [sp, #28]       ; get address of sse
+    mov     r0, r4              ; return sse
+    str     r4, [r1]            ; store sse
+
+    pop     {r4-r9, pc}
+
+    ENDP
+
+    END
diff --git a/third_party/aom/aom_dsp/arm/variance_neon.c b/third_party/aom/aom_dsp/arm/variance_neon.c
new file mode 100644
index 000000000..dbab287e3
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/variance_neon.c
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media.
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "./aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+// Returns the sum of the eight signed 16-bit lanes of v_16x8.
+// The lanes are widened pairwise (16 -> 32 -> 64 bit) and the low 32 bits of
+// the two 64-bit partial sums are then added.
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+  const int32x4_t a = vpaddlq_s16(v_16x8);
+  const int64x2_t b = vpaddlq_s32(a);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+}
+
+// Returns the sum of the four signed 32-bit lanes of v_32x4, truncated to the
+// low 32 bits of the pairwise-widened result.
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+  const int64x2_t b = vpaddlq_s32(v_32x4);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+}
+
+// v_sum holds eight int16 lanes and every lane receives w * h / 8 differences
+// in [-255, 255], so w * h must not exceed 1024 (128 * 255 = 32640) or the
+// local variable v_sum may overflow.
+static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, uint32_t *sse, + int *sum) { + int i, j; + int16x8_t v_sum = vdupq_n_s16(0); + int32x4_t v_sse_lo = vdupq_n_s32(0); + int32x4_t v_sse_hi = vdupq_n_s32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const uint8x8_t v_a = vld1_u8(&a[j]); + const uint8x8_t v_b = vld1_u8(&b[j]); + const uint16x8_t v_diff = vsubl_u8(v_a, v_b); + const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); + v_sum = vaddq_s16(v_sum, sv_diff); + v_sse_lo = + vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff)); + v_sse_hi = + vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff)); + } + a += a_stride; + b += b_stride; + } + + *sum = horizontal_add_s16x8(v_sum); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); +} + +void aom_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, unsigned int *sse, int *sum) { + variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum); +} + +void aom_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, unsigned int *sse, int *sum) { + variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum); +} + +unsigned int aom_variance8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); + return *sse - ((sum * sum) >> 6); +} + +unsigned int aom_variance16x16_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); + return *sse - (((unsigned int)((int64_t)sum * sum)) >> 8); +} + +unsigned int aom_variance32x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); + return *sse - (unsigned 
int)(((int64_t)sum * sum) >> 10); +} + +unsigned int aom_variance32x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); + variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride, + 32, 32, &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); +} + +unsigned int aom_variance64x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, + 64, 16, &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); +} + +unsigned int aom_variance64x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + + variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, + 64, 16, &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride), + b_stride, 64, 16, &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride), + b_stride, 64, 16, &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); +} + +unsigned int aom_variance16x8_neon(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint8x16_t q0u8, q1u8, 
q2u8, q3u8; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 4; i++) { + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = 
vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int aom_variance8x16_neon(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, unsigned int *sse) { + int i; + uint8x8_t d0u8, d2u8, d4u8, d6u8; + int16x4_t d22s16, d23s16, d24s16, d25s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint16x8_t q11u16, q12u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d2u8, d6u8); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 
= vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int aom_mse16x16_neon(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + int64x1_t d0s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + q7s32 = vdupq_n_s32(0); + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { // mse16x16_neon_loop + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q7s32 = vmlal_s16(q7s32, d22s16, d22s16); + q8s32 = vmlal_s16(q8s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q7s32 = vmlal_s16(q7s32, d26s16, d26s16); + q8s32 = vmlal_s16(q8s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 
= vreinterpret_s16_u16(vget_high_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); + q10s32 = vaddq_s32(q7s32, q9s32); + + q1s64 = vpaddlq_s32(q10s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); + return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); +} + +unsigned int aom_get4x4sse_cs_neon(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride) { + int16x4_t d22s16, d24s16, d26s16, d28s16; + int64x1_t d0s64; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d1u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d5u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d3u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d7u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d1u8, d5u8); + q13u16 = vsubl_u8(d2u8, d6u8); + q14u16 = vsubl_u8(d3u8, d7u8); + + d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); + d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); + d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); + d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); + + q7s32 = vmull_s16(d22s16, d22s16); + q8s32 = vmull_s16(d24s16, d24s16); + q9s32 = vmull_s16(d26s16, d26s16); + q10s32 = vmull_s16(d28s16, d28s16); + + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); + q9s32 = vaddq_s32(q7s32, q9s32); + + q1s64 = vpaddlq_s32(q9s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + 
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
diff --git a/third_party/aom/aom_dsp/avg.c b/third_party/aom/aom_dsp/avg.c
new file mode 100644
index 000000000..eb6059705
--- /dev/null
+++ b/third_party/aom/aom_dsp/avg.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <stdlib.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_ports/mem.h"
+
+// Sums the 64 samples of an 8x8 block (rows `stride` apart) and returns
+// ROUND_POWER_OF_TWO(sum, 6), i.e. the sum scaled down by the pixel count.
+unsigned int aom_avg_8x8_c(const uint8_t *src, int stride) {
+  int i, j;
+  int sum = 0;
+  for (i = 0; i < 8; ++i, src += stride)
+    for (j = 0; j < 8; sum += src[j], ++j) {
+    }
+
+  return ROUND_POWER_OF_TWO(sum, 6);
+}
+
+// Same as aom_avg_8x8_c for a 4x4 block (16 samples, shift of 4).
+unsigned int aom_avg_4x4_c(const uint8_t *src, int stride) {
+  int i, j;
+  int sum = 0;
+  for (i = 0; i < 4; ++i, src += stride)
+    for (j = 0; j < 4; sum += src[j], ++j) {
+    }
+
+  return ROUND_POWER_OF_TWO(sum, 4);
+}
+
+// One 8-point Hadamard pass over a column: reads eight values `src_stride`
+// apart and writes eight contiguous outputs to coeff (in a permuted order).
+// src_diff: first pass, 9 bit, dynamic range [-255, 255]
+//           second pass, 12 bit, dynamic range [-2040, 2040]
+static void hadamard_col8(const int16_t *src_diff, int src_stride,
+                          int16_t *coeff) {
+  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+  int16_t c0 = b0 + b2;
+  int16_t c1 = b1 + b3;
+  int16_t c2 = b0 - b2;
+  int16_t c3 = b1 - b3;
+  int16_t c4 = b4 + b6;
+  int16_t c5 = b5 + b7;
+  int16_t c6 = b4 - b6;
+  int16_t c7 = b5 - b7;
+
+  coeff[0] = c0 + c4;
+  coeff[7] = c1 + c5;
+  coeff[3] = c2 + c6;
+  coeff[4] = c3 + c7;
+  coeff[2] = c0 - c4;
+  coeff[6] = c1 - c5;
+  coeff[1] = c2 - c6;
+  coeff[5] = c3 - c7;
+}
+
+// 8x8 2D Hadamard transform: a column pass into a scratch buffer followed by
+// a second pass over that buffer into coeff (64 outputs).
+// The order of the output coeff of the hadamard is not important. For
+// optimization purposes the final transpose may be skipped.
+void aom_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
+                        int16_t *coeff) {
+  int idx;
+  int16_t buffer[64];
+  int16_t *tmp_buf = &buffer[0];
+  for (idx = 0; idx < 8; ++idx) {
+    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
+                                                   // dynamic range [-255, 255]
+    tmp_buf += 8;
+    ++src_diff;
+  }
+
+  tmp_buf = &buffer[0];
+  for (idx = 0; idx < 8; ++idx) {
+    hadamard_col8(tmp_buf, 8, coeff);  // tmp_buf: 12 bit
+                                       // dynamic range [-2040, 2040]
+    coeff += 8;                        // coeff: 15 bit
+                                       // dynamic range [-16320, 16320]
+    ++tmp_buf;
+  }
+}
+
+// 16x16 2D Hadamard transform: four 8x8 transforms (one per quadrant, 64
+// coefficients each) followed by an in-place butterfly across the quadrants.
+void aom_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
+                          int16_t *coeff) {
+  int idx;
+  for (idx = 0; idx < 4; ++idx) {
+    // src_diff: 9 bit, dynamic range [-255, 255]
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+    aom_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+  }
+
+  // coeff: 15 bit, dynamic range [-16320, 16320]
+  for (idx = 0; idx < 64; ++idx) {
+    int16_t a0 = coeff[0];
+    int16_t a1 = coeff[64];
+    int16_t a2 = coeff[128];
+    int16_t a3 = coeff[192];
+
+    int16_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
+    int16_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
+    int16_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
+    int16_t b3 = (a2 - a3) >> 1;
+
+    coeff[0] = b0 + b2;  // 16 bit, [-32640, 32640]
+    coeff[64] = b1 + b3;
+    coeff[128] = b0 - b2;
+    coeff[192] = b1 - b3;
+
+    ++coeff;
+  }
+}
+
+// Returns the sum of absolute values of `length` coefficients.
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+int aom_satd_c(const int16_t *coeff, int length) {
+  int i;
+  int satd = 0;
+  for (i = 0; i < length; ++i) satd += abs(coeff[i]);
+
+  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+  return satd;
+}
+
+// Integer projection onto row vectors: for each of 16 columns, sums `height`
+// samples down the column and divides by height / 2.
+// height: value range {16, 32, 64}.
+void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, int ref_stride,
+                       int height) {
+  int idx;
+  const int norm_factor = height >> 1;
+  for (idx = 0; idx < 16; ++idx) {
+    int i;
+    hbuf[idx] = 0;
+    // hbuf[idx]: 14 bit, dynamic range [0, 16320].
+    for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
+    // hbuf[idx]: 9 bit, dynamic range [0, 510].
+    hbuf[idx] /= norm_factor;
+    ++ref;
+  }
+}
+
+// Returns the sum of `width` consecutive samples.
+// width: value range {16, 32, 64}.
+int16_t aom_int_pro_col_c(const uint8_t *ref, int width) {
+  int idx;
+  int16_t sum = 0;
+  // sum: 14 bit, dynamic range [0, 16320]
+  for (idx = 0; idx < width; ++idx) sum += ref[idx];
+  return sum;
+}
+
+// Returns sse - mean^2 / width for the element-wise difference of two vectors
+// of width 4 << bwl.
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4}
+int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl) {
+  int i;
+  int width = 4 << bwl;
+  int sse = 0, mean = 0, var;
+
+  for (i = 0; i < width; ++i) {
+    int diff = ref[i] - src[i];  // diff: dynamic range [-510, 510], 10 bits.
+    mean += diff;                // mean: dynamic range 16 bits.
+    sse += diff * diff;          // sse: dynamic range 26 bits.
+  }
+
+  // (mean * mean): dynamic range 31 bits.
+  var = sse - ((mean * mean) >> (bwl + 2));
+  return var;
+}
+
+// Writes the smallest and largest absolute sample difference of an 8x8 block
+// to *min and *max. Samples are 8 bit, so 255 is the largest possible value.
+void aom_minmax_8x8_c(const uint8_t *src, int src_stride, const uint8_t *ref,
+                      int ref_stride, int *min, int *max) {
+  int i, j;
+  *min = 255;
+  *max = 0;
+  for (i = 0; i < 8; ++i, src += src_stride, ref += ref_stride) {
+    for (j = 0; j < 8; ++j) {
+      int diff = abs(src[j] - ref[j]);
+      *min = diff < *min ? diff : *min;
+      *max = diff > *max ? diff : *max;
+    }
+  }
+}
+
+#if CONFIG_HIGHBITDEPTH
+// High-bitdepth variant of aom_avg_8x8_c; src is converted to a uint16_t
+// sample pointer with CONVERT_TO_SHORTPTR.
+unsigned int aom_highbd_avg_8x8_c(const uint8_t *src, int stride) {
+  int i, j;
+  int sum = 0;
+  const uint16_t *s = CONVERT_TO_SHORTPTR(src);
+  for (i = 0; i < 8; ++i, s += stride)
+    for (j = 0; j < 8; sum += s[j], ++j) {
+    }
+
+  return ROUND_POWER_OF_TWO(sum, 6);
+}
+
+// High-bitdepth variant of aom_avg_4x4_c.
+unsigned int aom_highbd_avg_4x4_c(const uint8_t *src, int stride) {
+  int i, j;
+  int sum = 0;
+  const uint16_t *s = CONVERT_TO_SHORTPTR(src);
+  for (i = 0; i < 4; ++i, s += stride)
+    for (j = 0; j < 4; sum += s[j], ++j) {
+    }
+
+  return ROUND_POWER_OF_TWO(sum, 4);
+}
+
+// High-bitdepth variant of aom_minmax_8x8_c operating on uint16_t samples.
+void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
+                             int dp, int *min, int *max) {
+  int i, j;
+  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+  const uint16_t *d = CONVERT_TO_SHORTPTR(d8);
+  // Samples are 16 bit, so the running minimum is seeded with the largest
+  // possible |s - d|. The 8-bit seed of 255 capped the reported minimum
+  // whenever every difference in the block was larger than 255.
+  *min = 65535;
+  *max = 0;
+  for (i = 0; i < 8; ++i, s += p, d += dp) {
+    for (j = 0; j < 8; ++j) {
+      int diff = abs(s[j] - d[j]);
+      *min = diff < *min ? diff : *min;
+      *max = diff > *max ? diff : *max;
+    }
+  }
+}
+#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/binary_codes_reader.c b/third_party/aom/aom_dsp/binary_codes_reader.c
new file mode 100644
index 000000000..96c4cb436
--- /dev/null
+++ b/third_party/aom/aom_dsp/binary_codes_reader.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include "aom_dsp/bitreader.h" + +#include "av1/common/common.h" + +// Inverse recenters a non-negative literal v around a reference r +static uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) { + if (v > (r << 1)) + return v; + else if ((v & 1) == 0) + return (v >> 1) + r; + else + return r - ((v + 1) >> 1); +} + +// Inverse recenters a non-negative literal v in [0, n-1] around a +// reference r also in [0, n-1] +static uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) { + if ((r << 1) <= n) { + return inv_recenter_nonneg(r, v); + } else { + return n - 1 - inv_recenter_nonneg(n - 1 - r, v); + } +} + +int16_t aom_read_primitive_symmetric(aom_reader *r, unsigned int mag_bits) { + if (aom_read_bit(r, NULL)) { + int s = aom_read_bit(r, NULL); + int16_t x = aom_read_literal(r, mag_bits, NULL) + 1; + return (s > 0 ? -x : x); + } else { + return 0; + } +} + +uint16_t aom_read_primitive_quniform(aom_reader *r, uint16_t n) { + if (n <= 1) return 0; + const int l = get_msb(n - 1) + 1; + const int m = (1 << l) - n; + const int v = aom_read_literal(r, l - 1, NULL); + return v < m ? v : (v << 1) - m + aom_read_bit(r, NULL); +} + +uint16_t aom_read_primitive_refbilevel(aom_reader *r, uint16_t n, uint16_t p, + uint16_t ref) { + if (n <= 1) return 0; + assert(p > 0 && p <= n); + assert(ref < n); + int lolimit = ref - p / 2; + const int hilimit = lolimit + p - 1; + if (lolimit < 0) { + lolimit = 0; + } else if (hilimit >= n) { + lolimit = n - p; + } + int v; + if (aom_read_bit(r, NULL)) { + v = aom_read_primitive_quniform(r, p) + lolimit; + } else { + v = aom_read_primitive_quniform(r, n - p); + if (v >= lolimit) v += p; + } + return v; +} + +// Decode finite subexponential code that for a symbol v in [0, n-1] with +// parameter k +uint16_t aom_read_primitive_subexpfin(aom_reader *r, uint16_t n, uint16_t k) { + int i = 0; + int mk = 0; + uint16_t v; + while (1) { + int b = (i ? 
k + i - 1 : k); + int a = (1 << b); + if (n <= mk + 3 * a) { + v = aom_read_primitive_quniform(r, n - mk) + mk; + break; + } else { + if (aom_read_bit(r, NULL)) { + i = i + 1; + mk += a; + } else { + v = aom_read_literal(r, b, NULL) + mk; + break; + } + } + } + return v; +} + +// Decode finite subexponential code that for a symbol v in [0, n-1] with +// parameter k +// based on a reference ref also in [0, n-1]. +uint16_t aom_read_primitive_refsubexpfin(aom_reader *r, uint16_t n, uint16_t k, + uint16_t ref) { + return inv_recenter_finite_nonneg(n, ref, + aom_read_primitive_subexpfin(r, n, k)); +} + +// Decode finite subexponential code that for a symbol v in [-(n-1), n-1] with +// parameter k based on a reference ref also in [-(n-1), n-1]. +int16_t aom_read_signed_primitive_refsubexpfin(aom_reader *r, uint16_t n, + uint16_t k, int16_t ref) { + ref += n - 1; + const uint16_t scaled_n = (n << 1) - 1; + return aom_read_primitive_refsubexpfin(r, scaled_n, k, ref) - n + 1; +} diff --git a/third_party/aom/aom_dsp/binary_codes_reader.h b/third_party/aom/aom_dsp/binary_codes_reader.h new file mode 100644 index 000000000..738d91da8 --- /dev/null +++ b/third_party/aom/aom_dsp/binary_codes_reader.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_DSP_BINARY_CODES_READER_H_
+#define AOM_DSP_BINARY_CODES_READER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <assert.h>
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader.h"
+
+int16_t aom_read_primitive_symmetric(aom_reader *r, unsigned int mag_bits);
+
+uint16_t aom_read_primitive_quniform(aom_reader *r, uint16_t n);
+uint16_t aom_read_primitive_refbilevel(aom_reader *r, uint16_t n, uint16_t p,
+                                       uint16_t ref);
+uint16_t aom_read_primitive_subexpfin(aom_reader *r, uint16_t n, uint16_t k);
+uint16_t aom_read_primitive_refsubexpfin(aom_reader *r, uint16_t n, uint16_t k,
+                                         uint16_t ref);
+int16_t aom_read_signed_primitive_refsubexpfin(aom_reader *r, uint16_t n,
+                                               uint16_t k, int16_t ref);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_DSP_BINARY_CODES_READER_H_
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.c b/third_party/aom/aom_dsp/binary_codes_writer.c
new file mode 100644
index 000000000..91e807b29
--- /dev/null
+++ b/third_party/aom/aom_dsp/binary_codes_writer.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/bitwriter.h"
+
+#include "av1/common/common.h"
+
+// Recenters a non-negative literal v around a reference r
+static uint16_t recenter_nonneg(uint16_t r, uint16_t v) {
+  if (v > (r << 1))
+    return v;  // beyond the mirrored range [0, 2r]: passed through unchanged
+  else if (v >= r)
+    return ((v - r) << 1);  // at or above r: even codes
+  else
+    return ((r - v) << 1) - 1;  // below r: odd codes
+}
+
+// Recenters a non-negative literal v in [0, n-1] around a
+// reference r also in [0, n-1]
+static uint16_t recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) {
+  if ((r << 1) <= n) {
+    return recenter_nonneg(r, v);
+  } else {
+    return recenter_nonneg(n - 1 - r, n - 1 - v);  // mirror both about n-1
+  }
+}
+
+// Codes a symbol v in [-2^mag_bits, 2^mag_bits].
+// mag_bits is number of bits for magnitude. The alphabet is of size
+// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to
+// indicate 0 or non-zero, mag_bits bits are used to indicate magnitude
+// and 1 more bit for the sign if non-zero.
+void aom_write_primitive_symmetric(aom_writer *w, int16_t v,
+                                   unsigned int abs_bits) {
+  if (v == 0) {
+    aom_write_bit(w, 0);
+  } else {
+    const int x = abs(v);
+    const int s = v < 0;
+    aom_write_bit(w, 1);
+    aom_write_bit(w, s);
+    aom_write_literal(w, x - 1, abs_bits);  // magnitude is stored minus one
+  }
+}
+
+int aom_count_primitive_symmetric(int16_t v, unsigned int abs_bits) {
+  return (v == 0 ? 1 : abs_bits + 2);  // zero flag, or flag + sign + magnitude
+}
+
+// Encodes a value v in [0, n-1] quasi-uniformly
+void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) {
+  if (n <= 1) return;
+  const int l = get_msb(n - 1) + 1;
+  const int m = (1 << l) - n;  // values below m use l - 1 bits, the rest use l
+  if (v < m) {
+    aom_write_literal(w, v, l - 1);
+  } else {
+    aom_write_literal(w, m + ((v - m) >> 1), l - 1);
+    aom_write_bit(w, (v - m) & 1);
+  }
+}
+
+int aom_count_primitive_quniform(uint16_t n, uint16_t v) {
+  if (n <= 1) return 0;
+  const int l = get_msb(n - 1) + 1;
+  const int m = (1 << l) - n;
+  return v < m ? l - 1 : l;
+}
+
+// Encodes a value v in [0, n-1] based on a reference ref also in [0, n-1]
+// The closest p values of v from ref are coded using a p-ary quasi-uniform
+// short code while the remaining n-p values are coded with a longer code.
+void aom_write_primitive_refbilevel(aom_writer *w, uint16_t n, uint16_t p,
+                                    uint16_t ref, uint16_t v) {
+  if (n <= 1) return;
+  assert(p > 0 && p <= n);
+  assert(ref < n);
+  int lolimit = ref - p / 2;
+  int hilimit = lolimit + p - 1;
+  if (lolimit < 0) {
+    lolimit = 0;
+    hilimit = p - 1;
+  } else if (hilimit >= n) {
+    hilimit = n - 1;
+    lolimit = n - p;
+  }
+  if (v >= lolimit && v <= hilimit) {
+    aom_write_bit(w, 1);
+    v = v - lolimit;
+    aom_write_primitive_quniform(w, p, v);
+  } else {
+    aom_write_bit(w, 0);
+    if (v > hilimit) v -= p;  // close the gap left by the short-code window
+    aom_write_primitive_quniform(w, n - p, v);
+  }
+}
+
+int aom_count_primitive_refbilevel(uint16_t n, uint16_t p, uint16_t ref,
+                                   uint16_t v) {
+  if (n <= 1) return 0;
+  assert(p > 0 && p <= n);
+  assert(ref < n);
+  int lolimit = ref - p / 2;
+  int hilimit = lolimit + p - 1;
+  if (lolimit < 0) {
+    lolimit = 0;
+    hilimit = p - 1;
+  } else if (hilimit >= n) {
+    hilimit = n - 1;
+    lolimit = n - p;
+  }
+  int count = 0;
+  if (v >= lolimit && v <= hilimit) {
+    count++;
+    v = v - lolimit;
+    count += aom_count_primitive_quniform(p, v);
+  } else {
+    count++;
+    if (v > hilimit) v -= p;
+    count += aom_count_primitive_quniform(n - p, v);
+  }
+  return count;
+}
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
+                                   uint16_t v) {
+  int i = 0;
+  int mk = 0;
+  while (1) {
+    int b = (i ? k + i - 1 : k);  // literal width for the current interval
+    int a = (1 << b);             // size of the current interval
+    if (n <= mk + 3 * a) {
+      aom_write_primitive_quniform(w, n - mk, v - mk);
+      break;
+    } else {
+      int t = (v >= mk + a);  // 1: v lies past this interval, keep escaping
+      aom_write_bit(w, t);
+      if (t) {
+        i = i + 1;
+        mk += a;
+      } else {
+        aom_write_literal(w, v - mk, b);
+        break;
+      }
+    }
+  }
+}
+
+int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) {
+  int count = 0;
+  int i = 0;
+  int mk = 0;
+  while (1) {
+    int b = (i ? k + i - 1 : k);
+    int a = (1 << b);
+    if (n <= mk + 3 * a) {
+      count += aom_count_primitive_quniform(n - mk, v - mk);
+      break;
+    } else {
+      int t = (v >= mk + a);
+      count++;
+      if (t) {
+        i = i + 1;
+        mk += a;
+      } else {
+        count += b;
+        break;
+      }
+    }
+  }
+  return count;
+}
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+// based on a reference ref also in [0, n-1].
+// Recenters symbol around r first and then uses a finite subexponential code.
+void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k,
+                                      uint16_t ref, uint16_t v) {
+  aom_write_primitive_subexpfin(w, n, k, recenter_finite_nonneg(n, ref, v));
+}
+
+void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n,
+                                             uint16_t k, int16_t ref,
+                                             int16_t v) {
+  ref += n - 1;  // shift [-(n-1), n-1] onto [0, 2n-2]
+  v += n - 1;
+  const uint16_t scaled_n = (n << 1) - 1;
+  aom_write_primitive_refsubexpfin(w, scaled_n, k, ref, v);
+}
+
+int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
+                                     uint16_t v) {
+  return aom_count_primitive_subexpfin(n, k, recenter_finite_nonneg(n, ref, v));
+}
+
+int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
+                                            int16_t v) {
+  ref += n - 1;
+  v += n - 1;
+  const uint16_t scaled_n = (n << 1) - 1;
+  return aom_count_primitive_refsubexpfin(scaled_n, k, ref, v);
+}
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.h b/third_party/aom/aom_dsp/binary_codes_writer.h
new file mode 100644
index 000000000..ab5ccbf15
--- /dev/null
+++ b/third_party/aom/aom_dsp/binary_codes_writer.h
@@
-0,0 +1,70 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_BINARY_CODES_WRITER_H_
+#define AOM_DSP_BINARY_CODES_WRITER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <assert.h>
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitwriter.h"
+
+// Codes a symbol v in [-2^mag_bits, 2^mag_bits]
+// mag_bits is number of bits for magnitude. The alphabet is of size
+// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to
+// indicate 0 or non-zero, mag_bits bits are used to indicate magnitude
+// and 1 more bit for the sign if non-zero.
+void aom_write_primitive_symmetric(aom_writer *w, int16_t v,
+                                   unsigned int mag_bits);
+
+// Encodes a value v in [0, n-1] quasi-uniformly
+void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v);
+
+// Encodes a value v in [0, n-1] based on a reference ref also in [0, n-1]
+// The closest p values of v from ref are coded using a p-ary quasi-uniform
+// short code while the remaining n-p values are coded with a longer code.
+void aom_write_primitive_refbilevel(aom_writer *w, uint16_t n, uint16_t p,
+                                    uint16_t ref, uint16_t v);
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
+                                   uint16_t v);
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+// based on a reference ref also in [0, n-1].
+void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k,
+                                      uint16_t ref, uint16_t v);
+
+// Finite subexponential code that codes a symbol v in [-(n-1), n-1] with
+// parameter k based on a reference ref also in [-(n-1), n-1].
+void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n,
+                                             uint16_t k, int16_t ref,
+                                             int16_t v);
+
+// Functions that count bits for the above primitives
+int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits);
+int aom_count_primitive_quniform(uint16_t n, uint16_t v);
+int aom_count_primitive_refbilevel(uint16_t n, uint16_t p, uint16_t ref,
+                                   uint16_t v);
+int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v);
+int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
+                                     uint16_t v);
+int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
+                                            int16_t v);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_DSP_BINARY_CODES_WRITER_H_
diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h
new file mode 100644
index 000000000..9cd34dd48
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitreader.h
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_BITREADER_H_
+#define AOM_DSP_BITREADER_H_
+
+#include <assert.h>
+#include <limits.h>
+
+#include "./aom_config.h"
+#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL
+#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL."
+#endif + +#include "aom/aomdx.h" +#include "aom/aom_integer.h" +#if CONFIG_ANS +#include "aom_dsp/ansreader.h" +#elif CONFIG_DAALA_EC +#include "aom_dsp/daalaboolreader.h" +#else +#include "aom_dsp/dkboolreader.h" +#endif +#include "aom_dsp/prob.h" +#include "av1/common/odintrin.h" + +#if CONFIG_ACCOUNTING +#include "av1/decoder/accounting.h" +#define ACCT_STR_NAME acct_str +#define ACCT_STR_PARAM , const char *ACCT_STR_NAME +#define ACCT_STR_ARG(s) , s +#else +#define ACCT_STR_PARAM +#define ACCT_STR_ARG(s) +#endif + +#define aom_read(r, prob, ACCT_STR_NAME) \ + aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_bit(r, ACCT_STR_NAME) \ + aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_tree(r, tree, probs, ACCT_STR_NAME) \ + aom_read_tree_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_literal(r, bits, ACCT_STR_NAME) \ + aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME) \ + aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \ + aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME)) + +#ifdef __cplusplus +extern "C" { +#endif + +#if CONFIG_ANS +typedef struct AnsDecoder aom_reader; +#elif CONFIG_DAALA_EC +typedef struct daala_reader aom_reader; +#else +typedef struct aom_dk_reader aom_reader; +#endif + +static INLINE int aom_reader_init(aom_reader *r, const uint8_t *buffer, + size_t size, aom_decrypt_cb decrypt_cb, + void *decrypt_state) { +#if CONFIG_ANS + (void)decrypt_cb; + (void)decrypt_state; + if (size > INT_MAX) return 1; + return ans_read_init(r, buffer, (int)size); +#elif CONFIG_DAALA_EC + (void)decrypt_cb; + (void)decrypt_state; + return aom_daala_reader_init(r, buffer, (int)size); +#else + return aom_dk_reader_init(r, buffer, size, decrypt_cb, decrypt_state); +#endif +} + +static INLINE const uint8_t *aom_reader_find_end(aom_reader *r) { +#if CONFIG_ANS + (void)r; + assert(0 && "Use the 
raw buffer size with ANS"); + return NULL; +#elif CONFIG_DAALA_EC + return aom_daala_reader_find_end(r); +#else + return aom_dk_reader_find_end(r); +#endif +} + +static INLINE int aom_reader_has_error(aom_reader *r) { +#if CONFIG_ANS + return ans_reader_has_error(r); +#elif CONFIG_DAALA_EC + return aom_daala_reader_has_error(r); +#else + return aom_dk_reader_has_error(r); +#endif +} + +// Returns the position in the bit reader in bits. +static INLINE uint32_t aom_reader_tell(const aom_reader *r) { +#if CONFIG_ANS + (void)r; + assert(0 && "aom_reader_tell() is unimplemented for ANS"); + return 0; +#elif CONFIG_DAALA_EC + return aom_daala_reader_tell(r); +#else + return aom_dk_reader_tell(r); +#endif +} + +// Returns the position in the bit reader in 1/8th bits. +static INLINE uint32_t aom_reader_tell_frac(const aom_reader *r) { +#if CONFIG_ANS + (void)r; + assert(0 && "aom_reader_tell_frac() is unimplemented for ANS"); + return 0; +#elif CONFIG_DAALA_EC + return aom_daala_reader_tell_frac(r); +#else + return aom_dk_reader_tell_frac(r); +#endif +} + +#if CONFIG_ACCOUNTING +static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) { + if (r->accounting != NULL) { + uint32_t tell_frac; + tell_frac = aom_reader_tell_frac(r); + aom_accounting_record(r->accounting, ACCT_STR_NAME, + tell_frac - r->accounting->last_tell_frac); + r->accounting->last_tell_frac = tell_frac; + } +} + +static INLINE void aom_update_symb_counts(const aom_reader *r, int is_binary) { + if (r->accounting != NULL) { + r->accounting->syms.num_multi_syms += !is_binary; + r->accounting->syms.num_binary_syms += !!is_binary; + } +} +#endif + +static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) { + int ret; +#if CONFIG_ANS + ret = rabs_read(r, prob); +#elif CONFIG_DAALA_EC + ret = aom_daala_read(r, prob); +#else + ret = aom_dk_read(r, prob); +#endif +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); + aom_update_symb_counts(r, 1); 
+#endif + return ret; +} + +static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) { + int ret; +#if CONFIG_ANS + ret = rabs_read_bit(r); // Non trivial optimization at half probability +#elif CONFIG_DAALA_EC && CONFIG_RAWBITS + // Note this uses raw bits and is not the same as aom_daala_read(r, 128); + // Calls to this function are omitted from raw symbol accounting. + ret = aom_daala_read_bit(r); +#else + ret = aom_read(r, 128, NULL); // aom_prob_half +#endif +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); +#endif + return ret; +} + +static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) { + int literal = 0, bit; + + for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit; +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); +#endif + return literal; +} + +static INLINE int aom_read_tree_as_bits(aom_reader *r, + const aom_tree_index *tree, + const aom_prob *probs) { + aom_tree_index i = 0; + + while ((i = tree[i + aom_read(r, probs[i >> 1], NULL)]) > 0) continue; + return -i; +} + +#if CONFIG_EC_MULTISYMBOL +static INLINE int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf, + int nsymbs ACCT_STR_PARAM) { + int ret; +#if CONFIG_ANS + (void)nsymbs; + ret = rans_read(r, cdf); +#elif CONFIG_DAALA_EC + ret = daala_read_symbol(r, cdf, nsymbs); +#else +#error \ + "CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \ + "coder. Enable daala_ec or ans for a valid configuration." 
+#endif + +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); + aom_update_symb_counts(r, (nsymbs == 2)); +#endif + return ret; +} + +static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf, + int nsymbs ACCT_STR_PARAM) { + int ret; + ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME); +#if CONFIG_EC_ADAPT + update_cdf(cdf, ret, nsymbs); +#endif + return ret; +} + +static INLINE int aom_read_tree_as_cdf(aom_reader *r, + const aom_tree_index *tree, + const aom_prob *probs) { + aom_tree_index i = 0; + do { + aom_cdf_prob cdf[16]; + aom_tree_index index[16]; + int path[16]; + int dist[16]; + int nsymbs; + int symb; + nsymbs = tree_to_cdf(tree, probs, i, cdf, index, path, dist); + symb = aom_read_cdf(r, cdf, nsymbs, NULL); + OD_ASSERT(symb >= 0 && symb < nsymbs); + i = index[symb]; + } while (i > 0); + return -i; +} +#endif // CONFIG_EC_MULTISYMBOL + +static INLINE int aom_read_tree_(aom_reader *r, const aom_tree_index *tree, + const aom_prob *probs ACCT_STR_PARAM) { + int ret; +#if CONFIG_EC_MULTISYMBOL + ret = aom_read_tree_as_cdf(r, tree, probs); +#else + ret = aom_read_tree_as_bits(r, tree, probs); +#endif +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); +#endif + return ret; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_BITREADER_H_ diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c new file mode 100644 index 000000000..009682b4c --- /dev/null +++ b/third_party/aom/aom_dsp/bitreader_buffer.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "./aom_config.h"
+#include "./bitreader_buffer.h"
+
+size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb) {
+  return (rb->bit_offset + 7) >> 3;  // bits consumed, rounded up to bytes
+}
+
+int aom_rb_read_bit(struct aom_read_bit_buffer *rb) {
+  const uint32_t off = rb->bit_offset;
+  const uint32_t p = off >> 3;         // index of the byte holding the bit
+  const int q = 7 - (int)(off & 0x7);  // right shift for the bit, MSB first
+  if (rb->bit_buffer + p < rb->bit_buffer_end) {
+    const int bit = (rb->bit_buffer[p] >> q) & 1;
+    rb->bit_offset = off + 1;
+    return bit;
+  } else {
+    if (rb->error_handler) rb->error_handler(rb->error_handler_data);  // optional
+    return 0;  // reads past the end yield 0 and do not advance bit_offset
+  }
+}
+
+int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
+  int value = 0, bit;
+  for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit;
+  return value;  // most significant bit was read first
+}
+
+int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
+  const int value = aom_rb_read_literal(rb, bits);
+  return aom_rb_read_bit(rb) ? -value : value;  // sign bit follows magnitude
+}
+
+int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
+  const int nbits = sizeof(unsigned) * 8 - bits - 1;
+  const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits;
+  return ((int)value) >> nbits;  // shift up then down to sign-extend bits + 1
+}
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.h b/third_party/aom/aom_dsp/bitreader_buffer.h
new file mode 100644
index 000000000..22187357e
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitreader_buffer.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_BITREADER_BUFFER_H_
+#define AOM_DSP_BITREADER_BUFFER_H_
+
+#include <limits.h>
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*aom_rb_error_handler)(void *data);
+
+struct aom_read_bit_buffer {
+  const uint8_t *bit_buffer;      // first byte of the data being read
+  const uint8_t *bit_buffer_end;  // one past the last readable byte
+  uint32_t bit_offset;            // read position, in bits from bit_buffer
+
+  void *error_handler_data;            // passed back to error_handler
+  aom_rb_error_handler error_handler;  // invoked on a read past the end
+};
+
+size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb);
+
+int aom_rb_read_bit(struct aom_read_bit_buffer *rb);
+
+int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits);
+
+int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits);
+
+int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_DSP_BITREADER_BUFFER_H_
diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h
new file mode 100644
index 000000000..6e3fac260
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitwriter.h
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_DSP_BITWRITER_H_ +#define AOM_DSP_BITWRITER_H_ + +#include +#include "./aom_config.h" +#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL +#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL" +#endif + +#if CONFIG_ANS +#include "aom_dsp/buf_ans.h" +#elif CONFIG_DAALA_EC +#include "aom_dsp/daalaboolwriter.h" +#else +#include "aom_dsp/dkboolwriter.h" +#endif +#include "aom_dsp/prob.h" + +#if CONFIG_RD_DEBUG +#include "av1/common/blockd.h" +#include "av1/encoder/cost.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if CONFIG_ANS +typedef struct BufAnsCoder aom_writer; +#elif CONFIG_DAALA_EC +typedef struct daala_writer aom_writer; +#else +typedef struct aom_dk_writer aom_writer; +#endif + +typedef struct TOKEN_STATS { + int cost; +#if CONFIG_VAR_TX +#if CONFIG_RD_DEBUG + int txb_coeff_cost_map[TXB_COEFF_COST_MAP_SIZE][TXB_COEFF_COST_MAP_SIZE]; +#endif +#endif +} TOKEN_STATS; + +static INLINE void init_token_stats(TOKEN_STATS *token_stats) { +#if CONFIG_VAR_TX +#if CONFIG_RD_DEBUG + int r, c; + for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) { + for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { + token_stats->txb_coeff_cost_map[r][c] = 0; + } + } +#endif +#endif + token_stats->cost = 0; +} + +static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) { +#if CONFIG_ANS + (void)bc; + (void)buffer; + assert(0 && "buf_ans requires a more complicated startup procedure"); +#elif CONFIG_DAALA_EC + aom_daala_start_encode(bc, buffer); +#else + aom_dk_start_encode(bc, buffer); +#endif +} + +static INLINE void aom_stop_encode(aom_writer *bc) { +#if CONFIG_ANS + (void)bc; + assert(0 && "buf_ans requires a more complicated shutdown procedure"); +#elif CONFIG_DAALA_EC + aom_daala_stop_encode(bc); +#else + aom_dk_stop_encode(bc); +#endif +} + +static INLINE void aom_write(aom_writer *br, int bit, int probability) { +#if CONFIG_ANS + buf_rabs_write(br, bit, probability); +#elif CONFIG_DAALA_EC + aom_daala_write(br, bit, 
probability); +#else + aom_dk_write(br, bit, probability); +#endif +} + +static INLINE void aom_write_record(aom_writer *br, int bit, int probability, + TOKEN_STATS *token_stats) { + aom_write(br, bit, probability); +#if CONFIG_RD_DEBUG + token_stats->cost += av1_cost_bit(probability, bit); +#else + (void)token_stats; +#endif +} + +static INLINE void aom_write_bit(aom_writer *w, int bit) { +#if CONFIG_ANS + buf_rabs_write_bit(w, bit); +#elif CONFIG_DAALA_EC && CONFIG_RAWBITS + // Note this uses raw bits and is not the same as aom_daala_write(r, 128); + aom_daala_write_bit(w, bit); +#else + aom_write(w, bit, 128); // aom_prob_half +#endif +} + +static INLINE void aom_write_bit_record(aom_writer *w, int bit, + TOKEN_STATS *token_stats) { + aom_write_bit(w, bit); +#if CONFIG_RD_DEBUG + token_stats->cost += av1_cost_bit(128, bit); // aom_prob_half +#else + (void)token_stats; +#endif +} + +static INLINE void aom_write_literal(aom_writer *w, int data, int bits) { + int bit; + + for (bit = bits - 1; bit >= 0; bit--) aom_write_bit(w, 1 & (data >> bit)); +} + +static INLINE void aom_write_tree_as_bits(aom_writer *w, + const aom_tree_index *tr, + const aom_prob *probs, int bits, + int len, aom_tree_index i) { + do { + const int bit = (bits >> --len) & 1; + aom_write(w, bit, probs[i >> 1]); + i = tr[i + bit]; + } while (len); +} + +static INLINE void aom_write_tree_as_bits_record( + aom_writer *w, const aom_tree_index *tr, const aom_prob *probs, int bits, + int len, aom_tree_index i, TOKEN_STATS *token_stats) { + do { + const int bit = (bits >> --len) & 1; + aom_write_record(w, bit, probs[i >> 1], token_stats); + i = tr[i + bit]; + } while (len); +} + +#if CONFIG_EC_MULTISYMBOL +static INLINE void aom_write_cdf(aom_writer *w, int symb, + const aom_cdf_prob *cdf, int nsymbs) { +#if CONFIG_ANS + (void)nsymbs; + assert(cdf); + const aom_cdf_prob cum_prob = symb > 0 ? 
cdf[symb - 1] : 0; + const aom_cdf_prob prob = cdf[symb] - cum_prob; + buf_rans_write(w, cum_prob, prob); +#elif CONFIG_DAALA_EC + daala_write_symbol(w, symb, cdf, nsymbs); +#else +#error \ + "CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \ + "coder. Enable daala_ec or ans for a valid configuration." +#endif +} + +static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf, + int nsymbs) { + aom_write_cdf(w, symb, cdf, nsymbs); +#if CONFIG_EC_ADAPT + update_cdf(cdf, symb, nsymbs); +#endif +} + +static INLINE void aom_write_tree_as_cdf(aom_writer *w, + const aom_tree_index *tree, + const aom_prob *probs, int bits, + int len, aom_tree_index i) { + aom_tree_index root; + root = i; + do { + aom_cdf_prob cdf[16]; + aom_tree_index index[16]; + int path[16]; + int dist[16]; + int nsymbs; + int symb; + int j; + /* Compute the CDF of the binary tree using the given probabilities. */ + nsymbs = tree_to_cdf(tree, probs, root, cdf, index, path, dist); + /* Find the symbol to code. 
*/ + symb = -1; + for (j = 0; j < nsymbs; j++) { + /* If this symbol codes a leaf node, */ + if (index[j] <= 0) { + if (len == dist[j] && path[j] == bits) { + symb = j; + break; + } + } else { + if (len > dist[j] && path[j] == bits >> (len - dist[j])) { + symb = j; + break; + } + } + } + OD_ASSERT(symb != -1); + aom_write_cdf(w, symb, cdf, nsymbs); + bits &= (1 << (len - dist[symb])) - 1; + len -= dist[symb]; + } while (len); +} + +#endif // CONFIG_EC_MULTISYMBOL + +static INLINE void aom_write_tree(aom_writer *w, const aom_tree_index *tree, + const aom_prob *probs, int bits, int len, + aom_tree_index i) { +#if CONFIG_EC_MULTISYMBOL + aom_write_tree_as_cdf(w, tree, probs, bits, len, i); +#else + aom_write_tree_as_bits(w, tree, probs, bits, len, i); +#endif +} + +static INLINE void aom_write_tree_record(aom_writer *w, + const aom_tree_index *tree, + const aom_prob *probs, int bits, + int len, aom_tree_index i, + TOKEN_STATS *token_stats) { +#if CONFIG_EC_MULTISYMBOL + (void)token_stats; + aom_write_tree_as_cdf(w, tree, probs, bits, len, i); +#else + aom_write_tree_as_bits_record(w, tree, probs, bits, len, i, token_stats); +#endif +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_BITWRITER_H_ diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.c b/third_party/aom/aom_dsp/bitwriter_buffer.c new file mode 100644 index 000000000..1b3dd2913 --- /dev/null +++ b/third_party/aom/aom_dsp/bitwriter_buffer.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "./aom_config.h" +#include "./bitwriter_buffer.h" + +uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) { /* Bytes covered by the bits written so far: bit_offset / CHAR_BIT, rounded up. */ + return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0); +} + +void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit) { /* Stores bit at bit_offset, filling each byte from its most significant bit down, then advances bit_offset by one. */ + const int off = (int)wb->bit_offset; + const int p = off / CHAR_BIT; + const int q = CHAR_BIT - 1 - off % CHAR_BIT; + if (q == CHAR_BIT - 1) { + // Zero next char and write bit + wb->bit_buffer[p] = bit << q; + } else { + wb->bit_buffer[p] &= ~(1 << q); + wb->bit_buffer[p] |= bit << q; + } + wb->bit_offset = off + 1; +} + +void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit) { + // Do not zero bytes but overwrite existing values + const int off = (int)wb->bit_offset; + const int p = off / CHAR_BIT; + const int q = CHAR_BIT - 1 - off % CHAR_BIT; + wb->bit_buffer[p] &= ~(1 << q); + wb->bit_buffer[p] |= bit << q; + wb->bit_offset = off + 1; +} + +void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) { /* Writes the low bits-many bits of data, highest bit first, through aom_wb_write_bit(). */ + int bit; + for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1); +} + +void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data, + int bits) { /* Same bit order as aom_wb_write_literal(), but through aom_wb_overwrite_bit(). */ + int bit; + for (bit = bits - 1; bit >= 0; bit--) + aom_wb_overwrite_bit(wb, (data >> bit) & 1); +} + +void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data, + int bits) { /* Forwards data as a (bits + 1)-bit literal. 
*/ + aom_wb_write_literal(wb, data, bits + 1); +} diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.h b/third_party/aom/aom_dsp/bitwriter_buffer.h new file mode 100644 index 000000000..1f23dc857 --- /dev/null +++ b/third_party/aom/aom_dsp/bitwriter_buffer.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_BITWRITER_BUFFER_H_ +#define AOM_DSP_BITWRITER_BUFFER_H_ + +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct aom_write_bit_buffer { /* Bit-granular write cursor over a caller-supplied byte array. */ + uint8_t *bit_buffer; /* destination bytes */ + uint32_t bit_offset; /* index of the next bit to write, counted in bits from the start of bit_buffer */ +}; + +uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb); + +void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit); + +void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit); + +void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits); + +void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data, + int bits); + +void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data, + int bits); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_BITWRITER_BUFFER_H_ diff --git a/third_party/aom/aom_dsp/blend.h b/third_party/aom/aom_dsp/blend.h new file mode 100644 index 000000000..e5297ff83 --- /dev/null +++ b/third_party/aom/aom_dsp/blend.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_BLEND_H_ +#define AOM_DSP_BLEND_H_ + +#include "aom_ports/mem.h" + +// Various blending functions and macros. +// See also the aom_blend_* functions in aom_dsp_rtcd.h + +// Alpha blending with alpha values from the range [0, 64], where 64 +// means use the first input and 0 means use the second input. + +#define AOM_BLEND_A64_ROUND_BITS 6 +#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS) // 64 + +#define AOM_BLEND_A64(a, v0, v1) \ + ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \ + AOM_BLEND_A64_ROUND_BITS) + +// Alpha blending with alpha values from the range [0, 256], where 256 +// means use the first input and 0 means use the second input. +#define AOM_BLEND_A256_ROUND_BITS 8 +#define AOM_BLEND_A256_MAX_ALPHA (1 << AOM_BLEND_A256_ROUND_BITS) // 256 + +#define AOM_BLEND_A256(a, v0, v1) \ + ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A256_MAX_ALPHA - (a)) * (v1), \ + AOM_BLEND_A256_ROUND_BITS) + +// Blending by averaging. +#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1) + +#endif // AOM_DSP_BLEND_H_ diff --git a/third_party/aom/aom_dsp/blend_a64_hmask.c b/third_party/aom/aom_dsp/blend_a64_hmask.c new file mode 100644 index 000000000..99b4b8a59 --- /dev/null +++ b/third_party/aom/aom_dsp/blend_a64_hmask.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "./aom_dsp_rtcd.h" + +void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { /* Blends src0 and src1 into dst with AOM_BLEND_A64, using one mask value per column (mask[j]) shared by all h rows. h and w are asserted to be powers of two >= 1; a source that aliases dst must share its stride. */ + int i, j; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = AOM_BLEND_A64( + mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]); + } + } +} + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int h, int w, int bd) { /* Same per-column blend on buffers obtained through CONVERT_TO_SHORTPTR and accessed as uint16_t; bd is only checked by the assert below. 
*/ + int i, j; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + (void)bd; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = AOM_BLEND_A64( + mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]); + } + } +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/blend_a64_mask.c b/third_party/aom/aom_dsp/blend_a64_mask.c new file mode 100644 index 000000000..3e15542c9 --- /dev/null +++ b/third_party/aom/aom_dsp/blend_a64_mask.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/aom_dsp_common.h" + +#include "./aom_dsp_rtcd.h" + +// Blending with alpha mask. Mask values come from the range [0, 64], +// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can +// be the same as dst, or dst can be different from both sources. + +void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + int w, int subh, int subw) { + int i, j; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = mask[i * mask_stride + j]; + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 1) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = ROUND_POWER_OF_TWO( + mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + 
} + } + } else if (subw == 1 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], + mask[i * mask_stride + (2 * j + 1)]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j], + mask[(2 * i + 1) * mask_stride + j]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } +} + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w, int subh, int subw, int bd) { + int i, j; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + (void)bd; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + if (subw == 0 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = mask[i * mask_stride + j]; + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 1) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = ROUND_POWER_OF_TWO( + mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + 
} else if (subw == 1 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], + mask[i * mask_stride + (2 * j + 1)]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j], + mask[(2 * i + 1) * mask_stride + j]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/blend_a64_vmask.c b/third_party/aom/aom_dsp/blend_a64_vmask.c new file mode 100644 index 000000000..1a5e30e31 --- /dev/null +++ b/third_party/aom/aom_dsp/blend_a64_vmask.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "./aom_dsp_rtcd.h" + +void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { /* Blends src0 and src1 into dst with AOM_BLEND_A64, using one mask value per row (mask[i]) shared by all w columns. h and w are asserted to be powers of two >= 1; a source that aliases dst must share its stride. */ + int i, j; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + for (i = 0; i < h; ++i) { + const int m = mask[i]; + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } +} + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int h, int w, int bd) { /* Same per-row blend on buffers obtained through CONVERT_TO_SHORTPTR and accessed as uint16_t; bd is only checked by the assert below. 
*/ + int i, j; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + (void)bd; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + for (i = 0; i < h; ++i) { + const int m = mask[i]; + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/buf_ans.c b/third_party/aom/aom_dsp/buf_ans.c new file mode 100644 index 000000000..8fe1ff763 --- /dev/null +++ b/third_party/aom/aom_dsp/buf_ans.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom_dsp/buf_ans.h" +#include "aom_mem/aom_mem.h" +#include "aom/internal/aom_codec_internal.h" + +void aom_buf_ans_alloc(struct BufAnsCoder *c, + struct aom_internal_error_info *error, int size) { /* Records error and size on c and allocates room for size buffered symbols through AOM_CHECK_MEM_ERROR; size must be greater than 1. */ + c->error = error; + c->size = size; + assert(c->size > 1); + AOM_CHECK_MEM_ERROR(error, c->buf, aom_malloc(c->size * sizeof(*c->buf))); + // Initialize to overfull to trigger the assert in write. + c->offset = c->size + 1; +} + +void aom_buf_ans_free(struct BufAnsCoder *c) { /* Frees the symbol buffer and resets buf and size; offset is left as it was. */ + aom_free(c->buf); + c->buf = NULL; + c->size = 0; +} + +#if !ANS_MAX_SYMBOLS +void aom_buf_ans_grow(struct BufAnsCoder *c) { /* Doubles the buffer: allocates 2 * size entries, copies the existing size entries over and frees the old storage. */ + struct buffered_ans_symbol *new_buf = NULL; + int new_size = c->size * 2; + AOM_CHECK_MEM_ERROR(c->error, new_buf, + aom_malloc(new_size * sizeof(*new_buf))); + memcpy(new_buf, c->buf, c->size * sizeof(*c->buf)); + aom_free(c->buf); + c->buf = new_buf; + c->size = new_size; +} +#endif + +void aom_buf_ans_flush(struct BufAnsCoder *const c) { /* Serializes the buffered symbols from the last one back to the first into c->ans, then resets offset to 0 and adds the byte count returned by ans_write_end() to output_bytes. 
With ANS_MAX_SYMBOLS an empty buffer is a no-op. */ + int offset; +#if ANS_MAX_SYMBOLS + if (c->offset == 0) return; +#endif + assert(c->offset > 0); + offset = c->offset - 1; + // Code the first symbol such that it brings the state to the smallest normal + // state from an initial state that would have been a subnormal/refill state. + if (c->buf[offset].method == ANS_METHOD_RANS) { + c->ans.state += c->buf[offset].val_start; + } else { + c->ans.state += c->buf[offset].val_start ? 
c->buf[offset].prob : 0; + } + for (offset = offset - 1; offset >= 0; --offset) { /* remaining symbols, newest to oldest */ + if (c->buf[offset].method == ANS_METHOD_RANS) { + rans_write(&c->ans, c->buf[offset].val_start, c->buf[offset].prob); + } else { + rabs_write(&c->ans, (uint8_t)c->buf[offset].val_start, + (AnsP8)c->buf[offset].prob); + } + } + c->offset = 0; + c->output_bytes += ans_write_end(&c->ans); +} diff --git a/third_party/aom/aom_dsp/buf_ans.h b/third_party/aom/aom_dsp/buf_ans.h new file mode 100644 index 000000000..0768506b3 --- /dev/null +++ b/third_party/aom/aom_dsp/buf_ans.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_BUF_ANS_H_ +#define AOM_DSP_BUF_ANS_H_ +// Buffered forward ANS writer. +// Symbols are written to the writer in forward (decode) order and serialized +// backwards due to ANS's stack like behavior. 
+ +#include +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/ans.h" +#include "aom_dsp/answriter.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#define ANS_METHOD_RABS 0 +#define ANS_METHOD_RANS 1 + +struct buffered_ans_symbol { + unsigned int method : 1; // one of ANS_METHOD_RABS or ANS_METHOD_RANS + // TODO(aconverse): Should be possible to write this in terms of start for ABS + unsigned int val_start : RANS_PROB_BITS; // Boolean value for ABS + // start in symbol cycle for Rans + unsigned int prob : RANS_PROB_BITS; // Probability of this symbol +}; + +struct BufAnsCoder { + struct aom_internal_error_info *error; + struct buffered_ans_symbol *buf; + struct AnsCoder ans; + int size; + int offset; + int output_bytes; +#if ANS_MAX_SYMBOLS + int window_size; +#endif +}; + +// Allocate a buffered ANS coder to store size symbols. +// When ANS_MAX_SYMBOLS is turned on, the size is the fixed size of each ANS +// partition. +// When ANS_MAX_SYMBOLS is turned off, size is merely an initial hint and the +// buffer will grow on demand +void aom_buf_ans_alloc(struct BufAnsCoder *c, + struct aom_internal_error_info *error, int hint); + +void aom_buf_ans_free(struct BufAnsCoder *c); + +#if !ANS_MAX_SYMBOLS +void aom_buf_ans_grow(struct BufAnsCoder *c); +#endif + +void aom_buf_ans_flush(struct BufAnsCoder *const c); + +static INLINE void buf_ans_write_init(struct BufAnsCoder *const c, + uint8_t *const output_buffer) { + c->offset = 0; + c->output_bytes = 0; + ans_write_init(&c->ans, output_buffer); +} + +static INLINE void buf_rabs_write(struct BufAnsCoder *const c, uint8_t val, + AnsP8 prob) { + assert(c->offset <= c->size); +#if !ANS_MAX_SYMBOLS + if (c->offset == c->size) { + aom_buf_ans_grow(c); + } +#endif + c->buf[c->offset].method = ANS_METHOD_RABS; + c->buf[c->offset].val_start = val; + c->buf[c->offset].prob = prob; + ++c->offset; +#if ANS_MAX_SYMBOLS + if (c->offset == c->size) aom_buf_ans_flush(c); +#endif +} + +// Buffer 
one symbol for encoding using rANS. +// cum_prob: The cumulative probability before this symbol (the offset of +// the symbol in the symbol cycle) +// prob: The probability of this symbol (l_s from the paper) +// RANS_PRECISION takes the place of m from the paper. +static INLINE void buf_rans_write(struct BufAnsCoder *const c, + aom_cdf_prob cum_prob, aom_cdf_prob prob) { + assert(c->offset <= c->size); +#if !ANS_MAX_SYMBOLS + if (c->offset == c->size) { + aom_buf_ans_grow(c); + } +#endif + c->buf[c->offset].method = ANS_METHOD_RANS; + c->buf[c->offset].val_start = cum_prob; + c->buf[c->offset].prob = prob; + ++c->offset; +#if ANS_MAX_SYMBOLS + if (c->offset == c->size) aom_buf_ans_flush(c); +#endif +} + +static INLINE void buf_rabs_write_bit(struct BufAnsCoder *c, int bit) { + buf_rabs_write(c, bit, 128); +} + +static INLINE void buf_rabs_write_literal(struct BufAnsCoder *c, int literal, + int bits) { + int bit; + + assert(bits < 31); + for (bit = bits - 1; bit >= 0; bit--) + buf_rabs_write_bit(c, 1 & (literal >> bit)); +} + +static INLINE int buf_ans_write_end(struct BufAnsCoder *const c) { + assert(c->offset == 0); + return c->output_bytes; +} +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // AOM_DSP_BUF_ANS_H_ diff --git a/third_party/aom/aom_dsp/daalaboolreader.c b/third_party/aom/aom_dsp/daalaboolreader.c new file mode 100644 index 000000000..0fc7b14a5 --- /dev/null +++ b/third_party/aom/aom_dsp/daalaboolreader.c @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/daalaboolreader.h" + +int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size) { /* Returns 1 when size is non-zero but buffer is NULL; otherwise records the buffer bounds, initializes the entropy decoder over size - 1 bytes and returns 0. NOTE(review): presumably size - 1 skips the zero byte that aom_daala_stop_encode() appends; size == 0 would pass -1 here -- confirm callers never do that. */ + if (size && !buffer) { + return 1; + } + r->buffer_end = buffer + size; + r->buffer = buffer; + od_ec_dec_init(&r->ec, buffer, size - 1); +#if CONFIG_ACCOUNTING + r->accounting = NULL; +#endif + return 0; +} + +const uint8_t *aom_daala_reader_find_end(daala_reader *r) { /* Returns the end pointer recorded at init (buffer + size). */ + return r->buffer_end; +} + +uint32_t aom_daala_reader_tell(const daala_reader *r) { /* Thin wrapper over od_ec_dec_tell(). */ + return od_ec_dec_tell(&r->ec); +} + +uint32_t aom_daala_reader_tell_frac(const daala_reader *r) { /* Thin wrapper over od_ec_dec_tell_frac(). */ + return od_ec_dec_tell_frac(&r->ec); +} diff --git a/third_party/aom/aom_dsp/daalaboolreader.h b/third_party/aom/aom_dsp/daalaboolreader.h new file mode 100644 index 000000000..428d74db0 --- /dev/null +++ b/third_party/aom/aom_dsp/daalaboolreader.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_DAALABOOLREADER_H_ +#define AOM_DSP_DAALABOOLREADER_H_ + +#include "aom/aom_integer.h" +#include "aom_dsp/entdec.h" +#include "aom_dsp/prob.h" +#if CONFIG_ACCOUNTING +#include "av1/decoder/accounting.h" +#endif +#if CONFIG_BITSTREAM_DEBUG +#include +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG + +#ifdef __cplusplus +extern "C" { +#endif + +struct daala_reader { /* Input byte range plus the entropy-decoder state that reads from it. */ + const uint8_t *buffer; + const uint8_t *buffer_end; + od_ec_dec ec; +#if CONFIG_ACCOUNTING + Accounting *accounting; +#endif +}; + +typedef struct daala_reader daala_reader; + +int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size); +const uint8_t *aom_daala_reader_find_end(daala_reader *r); +uint32_t aom_daala_reader_tell(const daala_reader *r); +uint32_t aom_daala_reader_tell_frac(const daala_reader *r); + +static INLINE int aom_daala_read(daala_reader *r, int prob) { /* Decodes one bit: rescales prob into p (the formula depends on CONFIG_EC_SMALLMUL) and reads it with od_ec_decode_bool_q15(). Under CONFIG_BITSTREAM_DEBUG the result is cross-checked against the entry popped from the bitstream queue and any mismatch asserts. 
*/ + int bit; +#if CONFIG_EC_SMALLMUL + int p = (0x7FFFFF - (prob << 15) + prob) >> 8; +#else + int p = ((prob << 15) + 256 - prob) >> 8; +#endif +#if CONFIG_BITSTREAM_DEBUG +/*{ + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = bitstream_queue_get_frame_read(); + if (frame_idx == 0 && queue_r == 0) { + fprintf(stderr, "\n *** bitstream queue at frame_idx_r %d queue_r %d\n", + frame_idx, queue_r); + } +}*/ +#endif + + bit = od_ec_decode_bool_q15(&r->ec, p); + +#if CONFIG_BITSTREAM_DEBUG + { + int i; + int ref_bit, ref_nsymbs; + aom_cdf_prob ref_cdf[16]; + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = bitstream_queue_get_frame_read(); + bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs); + if (ref_nsymbs != 2) { + fprintf(stderr, + "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs " + "%d queue_r %d\n", + frame_idx, 2, ref_nsymbs, queue_r); + assert(0); + } + if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) || + (ref_cdf[1] != 32767)) { + fprintf(stderr, + "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, 
%d} ref_cdf {%d", + frame_idx, p, 32767, ref_cdf[0]); + for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]); + fprintf(stderr, "} queue_r %d\n", queue_r); + assert(0); + } + if (bit != ref_bit) { + fprintf(stderr, + "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d " + "queue_r %d\n", + frame_idx, bit, ref_bit, queue_r); + assert(0); + } + } +#endif + + return bit; +} + +#if CONFIG_RAWBITS +static INLINE int aom_daala_read_bit(daala_reader *r) { /* Reads one raw bit with od_ec_dec_bits(). */ + return od_ec_dec_bits(&r->ec, 1, "aom_bits"); +} +#endif + +static INLINE int aom_daala_reader_has_error(daala_reader *r) { /* Returns the entropy decoder's error field. */ + return r->ec.error; +} + +static INLINE int daala_read_symbol(daala_reader *r, const aom_cdf_prob *cdf, + int nsymbs) { /* Decodes one of nsymbs symbols against cdf with od_ec_decode_cdf_q15(). The CONFIG_BITSTREAM_DEBUG block compares nsymbs, every cdf entry and the decoded symbol with the queued reference and asserts on any mismatch. 
*/ + int symb; + symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs); + +#if CONFIG_BITSTREAM_DEBUG + { + int i; + int cdf_error = 0; + int ref_symb, ref_nsymbs; + aom_cdf_prob ref_cdf[16]; + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = bitstream_queue_get_frame_read(); + bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs); + if (nsymbs != ref_nsymbs) { + fprintf(stderr, + "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d " + "queue_r %d\n", + frame_idx, nsymbs, ref_nsymbs, queue_r); + cdf_error = 0; + assert(0); + } else { + for (i = 0; i < nsymbs; ++i) + if (cdf[i] != ref_cdf[i]) cdf_error = 1; + } + if (cdf_error) { + fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx, + cdf[0]); + for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]); + fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]); + for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]); + fprintf(stderr, "} queue_r %d\n", queue_r); + assert(0); + } + if (symb != ref_symb) { + fprintf( + stderr, + "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n", + frame_idx, symb, ref_symb, queue_r); + assert(0); + } + } +#endif + + return symb; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git 
a/third_party/aom/aom_dsp/daalaboolwriter.c b/third_party/aom/aom_dsp/daalaboolwriter.c new file mode 100644 index 000000000..0ba8f6ab8 --- /dev/null +++ b/third_party/aom/aom_dsp/daalaboolwriter.c @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "aom_dsp/daalaboolwriter.h" + +void aom_daala_start_encode(daala_writer *br, uint8_t *source) { /* Points the writer at the caller's output buffer, resets pos to 0 and initializes the entropy encoder. NOTE(review): 62025 is passed straight to od_ec_enc_init(); presumably an initial buffer size -- confirm against entenc. */ + br->buffer = source; + br->pos = 0; + od_ec_enc_init(&br->ec, 62025); +} + +void aom_daala_stop_encode(daala_writer *br) { /* Finishes the entropy coder, copies its daala_bytes of output into br->buffer, appends one zero byte (so pos ends at daala_bytes + 1) and clears the encoder state. */ + uint32_t daala_bytes; + unsigned char *daala_data; + daala_data = od_ec_enc_done(&br->ec, &daala_bytes); + memcpy(br->buffer, daala_data, daala_bytes); + br->pos = daala_bytes; + /* Prevent ec bitstream from being detected as a superframe marker. + Must always be added, so that rawbits knows the exact length of the + bitstream. 
*/ + br->buffer[br->pos++] = 0; + od_ec_enc_clear(&br->ec); +} diff --git a/third_party/aom/aom_dsp/daalaboolwriter.h b/third_party/aom/aom_dsp/daalaboolwriter.h new file mode 100644 index 000000000..bbaf53c69 --- /dev/null +++ b/third_party/aom/aom_dsp/daalaboolwriter.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_DAALABOOLWRITER_H_ +#define AOM_DSP_DAALABOOLWRITER_H_ + +#include + +#include "aom_dsp/entenc.h" +#include "aom_dsp/prob.h" +#if CONFIG_BITSTREAM_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG + +#ifdef __cplusplus +extern "C" { +#endif + +struct daala_writer { + unsigned int pos; + uint8_t *buffer; + od_ec_enc ec; +}; + +typedef struct daala_writer daala_writer; + +void aom_daala_start_encode(daala_writer *w, uint8_t *buffer); +void aom_daala_stop_encode(daala_writer *w); + +static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) { +#if CONFIG_EC_SMALLMUL + int p = (0x7FFFFF - (prob << 15) + prob) >> 8; +#else + int p = ((prob << 15) + 256 - prob) >> 8; +#endif +#if CONFIG_BITSTREAM_DEBUG + aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 }; + /*int queue_r = 0; + int frame_idx_r = 0; + int queue_w = bitstream_queue_get_write(); + int frame_idx_w = bitstream_queue_get_frame_write(); + if (frame_idx_w == frame_idx_r && queue_w == queue_r) { + fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n", + frame_idx_w, queue_w); + }*/ + bitstream_queue_push(bit, cdf, 2); +#endif + + od_ec_encode_bool_q15(&w->ec, bit, p); +} + +#if CONFIG_RAWBITS +static INLINE void aom_daala_write_bit(daala_writer *w, int bit) { + od_ec_enc_bits(&w->ec, bit, 1); +} +#endif + +static INLINE void daala_write_symbol(daala_writer *w, int symb, + const aom_cdf_prob *cdf, int nsymbs) { +#if CONFIG_BITSTREAM_DEBUG + /*int queue_r = 0; + int frame_idx_r = 0; + int queue_w = bitstream_queue_get_write(); + int frame_idx_w = bitstream_queue_get_frame_write(); + if (frame_idx_w == frame_idx_r && queue_w == queue_r) { + fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n", + frame_idx_w, queue_w); + }*/ + bitstream_queue_push(symb, 
// Reads one bit with probability 128.
static INLINE int aom_dk_read_bit(struct aom_dk_reader *r) {
  return aom_dk_read(r, 128);  // aom_prob_half
}

// Prepares a reader over the 'size' bytes starting at 'buffer'.
//   decrypt_cb / decrypt_state: optional callback (and its context) used by
//   aom_dk_reader_fill() to obtain clear bytes; pass NULL to read directly.
// Returns 1 when 'size' is non-zero but 'buffer' is NULL; otherwise returns
// whether the first decoded bit (the marker bit) is non-zero.
int aom_dk_reader_init(struct aom_dk_reader *r, const uint8_t *buffer,
                       size_t size, aom_decrypt_cb decrypt_cb,
                       void *decrypt_state) {
  if (size && !buffer) {
    return 1;
  } else {
    r->buffer_end = buffer + size;
    r->buffer_start = r->buffer = buffer;
    r->value = 0;
    r->count = -8;
    r->range = 255;
    r->decrypt_cb = decrypt_cb;
    r->decrypt_state = decrypt_state;
    aom_dk_reader_fill(r);
#if CONFIG_ACCOUNTING
    r->accounting = NULL;
#endif
    return aom_dk_read_bit(r) != 0;  // marker bit
  }
}

// Tops up r->value with bytes from the input and advances r->buffer and
// r->count accordingly. Once the input cannot satisfy the request, count is
// additionally increased by LOTS_OF_BITS.
void aom_dk_reader_fill(struct aom_dk_reader *r) {
  const uint8_t *const buffer_end = r->buffer_end;
  const uint8_t *buffer = r->buffer;
  const uint8_t *buffer_start = buffer;
  BD_VALUE value = r->value;
  int count = r->count;
  const size_t bytes_left = buffer_end - buffer;
  const size_t bits_left = bytes_left * CHAR_BIT;
  // Bit position (counted from the LSB of 'value') where the next input byte
  // is OR-ed in.
  int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);

  if (r->decrypt_cb) {
    // Decrypt at most sizeof(clear_buffer) bytes into the scratch buffer and
    // read from there instead of from the caller's buffer.
    size_t n = AOMMIN(sizeof(r->clear_buffer), bytes_left);
    r->decrypt_cb(r->decrypt_state, buffer, r->clear_buffer, (int)n);
    buffer = r->clear_buffer;
    buffer_start = r->clear_buffer;
  }
  if (bits_left > BD_VALUE_SIZE) {
    // Fast path: more than a full BD_VALUE of input remains, so load one
    // whole word and keep only its top 'bits' bits ('shift' rounded down to a
    // multiple of 8, plus one byte).
    const int bits = (shift & 0xfffffff8) + CHAR_BIT;
    BD_VALUE nv;
    BD_VALUE big_endian_values;
    memcpy(&big_endian_values, buffer, sizeof(BD_VALUE));
    // NOTE(review): HToBE64/HToBE32 come from aom_util/endian_inl.h;
    // presumably they make the first byte in memory the most significant.
#if SIZE_MAX == 0xffffffffffffffffULL
    big_endian_values = HToBE64(big_endian_values);
#else
    big_endian_values = HToBE32(big_endian_values);
#endif
    nv = big_endian_values >> (BD_VALUE_SIZE - bits);
    count += bits;
    buffer += (bits >> 3);
    value = r->value | (nv << (shift & 0x7));
  } else {
    // Slow path: feed the remaining bytes one at a time. bits_over is how far
    // the requested fill would run past the available input; when it is
    // non-negative the input is about to be exhausted, so count is bumped by
    // LOTS_OF_BITS and the loop stops at bits_over.
    const int bits_over = (int)(shift + CHAR_BIT - (int)bits_left);
    int loop_end = 0;
    if (bits_over >= 0) {
      count += LOTS_OF_BITS;
      loop_end = bits_over;
    }

    // Skipped entirely when no input remains (bits_over >= 0, bits_left == 0).
    if (bits_over < 0 || bits_left) {
      while (shift >= loop_end) {
        count += CHAR_BIT;
        value |= (BD_VALUE)*buffer++ << shift;
        shift -= CHAR_BIT;
      }
    }
  }

  // NOTE: Variable 'buffer' may not relate to 'r->buffer' after decryption,
  // so we increase 'r->buffer' by the amount that 'buffer' moved, rather than
  // assign 'buffer' to 'r->buffer'.
  r->buffer += buffer - buffer_start;
  r->value = value;
  r->count = count;
}

// Steps r->buffer back by one byte (and r->count down by CHAR_BIT) for as
// long as count lies strictly between CHAR_BIT and BD_VALUE_SIZE, then
// returns the resulting pointer. Modifies the reader state.
const uint8_t *aom_dk_reader_find_end(struct aom_dk_reader *r) {
  // Find the end of the coded buffer
  while (r->count > CHAR_BIT && r->count < BD_VALUE_SIZE) {
    r->count -= CHAR_BIT;
    r->buffer--;
  }
  return r->buffer;
}
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_DKBOOLREADER_H_ +#define AOM_DSP_DKBOOLREADER_H_ + +#include +#include +#include + +#include "./aom_config.h" +#if CONFIG_BITSTREAM_DEBUG +#include +#include +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG + +#include "aom_ports/mem.h" +#include "aom/aomdx.h" +#include "aom/aom_integer.h" +#include "aom_dsp/prob.h" +#if CONFIG_ACCOUNTING +#include "av1/decoder/accounting.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef size_t BD_VALUE; + +#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT) + +// This is meant to be a large, positive constant that can still be efficiently +// loaded as an immediate (on platforms like ARM, for example). +// Even relatively modest values like 100 would work fine. +#define LOTS_OF_BITS 0x40000000 + +struct aom_dk_reader { + // Be careful when reordering this struct, it may impact the cache negatively. 
+ BD_VALUE value; + unsigned int range; + int count; + const uint8_t *buffer_start; + const uint8_t *buffer_end; + const uint8_t *buffer; + aom_decrypt_cb decrypt_cb; + void *decrypt_state; + uint8_t clear_buffer[sizeof(BD_VALUE) + 1]; +#if CONFIG_ACCOUNTING + Accounting *accounting; +#endif +}; + +int aom_dk_reader_init(struct aom_dk_reader *r, const uint8_t *buffer, + size_t size, aom_decrypt_cb decrypt_cb, + void *decrypt_state); + +void aom_dk_reader_fill(struct aom_dk_reader *r); + +const uint8_t *aom_dk_reader_find_end(struct aom_dk_reader *r); + +static INLINE uint32_t aom_dk_reader_tell(const struct aom_dk_reader *r) { + const uint32_t bits_read = + (uint32_t)((r->buffer - r->buffer_start) * CHAR_BIT); + const int count = + (r->count < LOTS_OF_BITS) ? r->count : r->count - LOTS_OF_BITS; + assert(r->buffer >= r->buffer_start); + return bits_read - (count + CHAR_BIT); +} + +/*The resolution of fractional-precision bit usage measurements, i.e., + 3 => 1/8th bits.*/ +#define DK_BITRES (3) + +static INLINE uint32_t aom_dk_reader_tell_frac(const struct aom_dk_reader *r) { + uint32_t num_bits; + uint32_t range; + int l; + int i; + num_bits = aom_dk_reader_tell(r) << DK_BITRES; + range = r->range; + l = 0; + for (i = DK_BITRES; i-- > 0;) { + int b; + range = range * range >> 7; + b = (int)(range >> 8); + l = l << 1 | b; + range >>= b; + } + return num_bits - l; +} + +static INLINE int aom_dk_reader_has_error(struct aom_dk_reader *r) { + // Check if we have reached the end of the buffer. + // + // Variable 'count' stores the number of bits in the 'value' buffer, minus + // 8. The top byte is part of the algorithm, and the remainder is buffered + // to be shifted into it. So if count == 8, the top 16 bits of 'value' are + // occupied, 8 for the algorithm and 8 in the buffer. + // + // When reading a byte from the user's buffer, count is filled with 8 and + // one byte is filled into the value buffer. 
When we reach the end of the + // data, count is additionally filled with LOTS_OF_BITS. So when + // count == LOTS_OF_BITS - 1, the user's data has been exhausted. + // + // 1 if we have tried to decode bits after the end of stream was encountered. + // 0 No error. + return r->count > BD_VALUE_SIZE && r->count < LOTS_OF_BITS; +} + +static INLINE int aom_dk_read(struct aom_dk_reader *r, int prob) { + unsigned int bit = 0; + BD_VALUE value; + BD_VALUE bigsplit; + int count; + unsigned int range; + unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT; + + if (r->count < 0) aom_dk_reader_fill(r); + + value = r->value; + count = r->count; + + bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT); + + range = split; + + if (value >= bigsplit) { + range = r->range - split; + value = value - bigsplit; + bit = 1; + } + + { + register int shift = aom_norm[range]; + range <<= shift; + value <<= shift; + count -= shift; + } + r->value = value; + r->count = count; + r->range = range; + +#if CONFIG_BITSTREAM_DEBUG + { + int ref_bit, ref_prob; + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = bitstream_queue_get_frame_read(); + bitstream_queue_pop(&ref_bit, &ref_prob); + if (prob != ref_prob) { + fprintf( + stderr, + "\n *** prob error, frame_idx_r %d prob %d ref_prob %d queue_r %d\n", + frame_idx, prob, ref_prob, queue_r); + assert(0); + } + if ((int)bit != ref_bit) { + fprintf(stderr, "\n *** bit error, frame_idx_r %d bit %d ref_bit %d\n", + frame_idx, bit, ref_bit); + assert(0); + } + } +#endif // CONFIG_BITSTREAM_DEBUG + + return bit; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_DKBOOLREADER_H_ diff --git a/third_party/aom/aom_dsp/dkboolwriter.c b/third_party/aom/aom_dsp/dkboolwriter.c new file mode 100644 index 000000000..fc98e7c9b --- /dev/null +++ b/third_party/aom/aom_dsp/dkboolwriter.c @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./dkboolwriter.h" + +static INLINE void aom_dk_write_bit(aom_dk_writer *w, int bit) { + aom_dk_write(w, bit, 128); // aom_prob_half +} + +void aom_dk_start_encode(aom_dk_writer *br, uint8_t *source) { + br->lowvalue = 0; + br->range = 255; + br->count = -24; + br->buffer = source; + br->pos = 0; + aom_dk_write_bit(br, 0); +} + +void aom_dk_stop_encode(aom_dk_writer *br) { + int i; + +#if CONFIG_BITSTREAM_DEBUG + bitstream_queue_set_skip_write(1); +#endif // CONFIG_BITSTREAM_DEBUG + + for (i = 0; i < 32; i++) aom_dk_write_bit(br, 0); + +#if CONFIG_BITSTREAM_DEBUG + bitstream_queue_set_skip_write(0); +#endif // CONFIG_BITSTREAM_DEBUG + + // Ensure there's no ambigous collision with any index marker bytes + if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) br->buffer[br->pos++] = 0; +} diff --git a/third_party/aom/aom_dsp/dkboolwriter.h b/third_party/aom/aom_dsp/dkboolwriter.h new file mode 100644 index 000000000..835436885 --- /dev/null +++ b/third_party/aom/aom_dsp/dkboolwriter.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
// State of the boolean writer.
typedef struct aom_dk_writer {
  unsigned int lowvalue;  // Low end of the coding interval (pending bits).
  unsigned int range;     // Current interval width.
  int count;              // Bit counter; a byte is emitted once it is >= 0.
  unsigned int pos;       // Index of the next byte to write in 'buffer'.
  uint8_t *buffer;        // Caller-supplied output buffer.
} aom_dk_writer;

void aom_dk_start_encode(aom_dk_writer *bc, uint8_t *buffer);
void aom_dk_stop_encode(aom_dk_writer *bc);

// Codes one binary decision.
//   br:          writer state; updated in place, may append one byte to
//                br->buffer and propagate a carry into earlier bytes.
//   bit:         the value to code (non-zero selects the upper sub-interval).
//   probability: weight used to place the split point inside the range.
static INLINE void aom_dk_write(aom_dk_writer *br, int bit, int probability) {
  unsigned int split;
  int count = br->count;
  unsigned int range = br->range;
  unsigned int lowvalue = br->lowvalue;
  register int shift;

#if CONFIG_BITSTREAM_DEBUG
  // int queue_r = 0;
  // int frame_idx_r = 0;
  // int queue_w = bitstream_queue_get_write();
  // int frame_idx_w = bitstream_queue_get_frame_write();
  // if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
  //   fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
  //   frame_idx_w, queue_w);
  // }
  bitstream_queue_push(bit, probability);
#endif  // CONFIG_BITSTREAM_DEBUG

  split = 1 + (((range - 1) * probability) >> 8);

  // A 0 bit keeps the lower part [0, split); a 1 bit moves to the upper part.
  range = split;

  if (bit) {
    lowvalue += split;
    range = br->range - split;
  }

  // Renormalisation shift looked up from the aom_norm table.
  shift = aom_norm[range];

  range <<= shift;
  count += shift;

  if (count >= 0) {
    // A full byte is ready: shift out 'offset' bits first, emit the byte,
    // then apply the remaining 'count' bits of the shift below.
    int offset = shift - count;

    if ((lowvalue << (offset - 1)) & 0x80000000) {
      // The addition above carried out of the window: propagate the carry
      // into the bytes already written, turning trailing 0xff bytes into 0.
      int x = br->pos - 1;

      while (x >= 0 && br->buffer[x] == 0xff) {
        br->buffer[x] = 0;
        x--;
      }

      // NOTE(review): if every byte written so far were 0xff, x would be -1
      // here; presumably the coder's invariants rule that out -- confirm.
      br->buffer[x] += 1;
    }

    br->buffer[br->pos++] = (lowvalue >> (24 - offset));
    lowvalue <<= offset;
    shift = count;
    lowvalue &= 0xffffff;
    count -= 8;
  }

  lowvalue <<= shift;
  br->count = count;
  br->lowvalue = lowvalue;
  br->range = range;
}
+ The computation here is independent of val itself (the decoder does not + even track that value), even though the real number of bits used after + od_ec_enc_done() may be 1 smaller if rng is a power of two and the + corresponding trailing bits of val are all zeros. + If we did try to track that special case, then coding a value with a + probability of 1/(1 << n) might sometimes appear to use more than n bits. + This may help explain the surprising result that a newly initialized + encoder or decoder claims to have used 1 bit.*/ + nbits = nbits_total << OD_BITRES; + l = 0; + for (i = OD_BITRES; i-- > 0;) { + int b; + rng = rng * rng >> 15; + b = (int)(rng >> 16); + l = l << 1 | b; + rng >>= b; + } + return nbits - l; +} diff --git a/third_party/aom/aom_dsp/entcode.h b/third_party/aom/aom_dsp/entcode.h new file mode 100644 index 000000000..534959e66 --- /dev/null +++ b/third_party/aom/aom_dsp/entcode.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
/*The window type that holds the entropy coder's working bits.
  OPT: od_ec_window must be at least 32 bits, but if you have fast arithmetic
   on a larger type, you can speed up the decoder by using it here.*/
typedef uint32_t od_ec_window;

/*The width of od_ec_window, in bits.*/
#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)

/*The number of bits to use for the range-coded part of unsigned integers.*/
#define OD_EC_UINT_BITS (4)

/*The resolution of fractional-precision bit usage measurements, i.e.,
   3 => 1/8th bits.*/
#define OD_BITRES (3)

/*With CONFIG_EC_SMALLMUL, the value stored in a CDF is 32768 minus the actual
   Q15 cumulative probability (an "inverse" CDF).
  This macro converts from one representation to the other (and is its own
   inverse).
  Without CONFIG_EC_SMALLMUL it is the identity.*/
#if CONFIG_EC_SMALLMUL
#define OD_ICDF(x) (32768U - (x))
#else
#define OD_ICDF(x) (x)
#endif

/*See entcode.c for further documentation.*/

OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total,
                                               uint32_t rng);
+ This is an entropy decoder based upon \cite{Mar79}, which is itself a + rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}. + It is very similar to arithmetic encoding, except that encoding is done with + digits in any base, instead of with bits, and so it is faster when using + larger bases (i.e.: a byte). + The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$ + is the base, longer than the theoretical optimum, but to my knowledge there + is no published justification for this claim. + This only seems true when using near-infinite precision arithmetic so that + the process is carried out with no rounding errors. + + An excellent description of implementation details is available at + http://www.arturocampos.com/ac_range.html + A recent work \cite{MNW98} which proposes several changes to arithmetic + encoding for efficiency actually re-discovers many of the principles + behind range encoding, and presents a good theoretical analysis of them. + + End of stream is handled by writing out the smallest number of bits that + ensures that the stream will be correctly decoded regardless of the value of + any subsequent bits. + od_ec_dec_tell() can be used to determine how many bits were needed to decode + all the symbols thus far; other data can be packed in the remaining bits of + the input buffer. + @PHDTHESIS{Pas76, + author="Richard Clark Pasco", + title="Source coding algorithms for fast data compression", + school="Dept. 
of Electrical Engineering, Stanford University", + address="Stanford, CA", + month=May, + year=1976, + URL="http://www.richpasco.org/scaffdc.pdf" + } + @INPROCEEDINGS{Mar79, + author="Martin, G.N.N.", + title="Range encoding: an algorithm for removing redundancy from a digitised + message", + booktitle="Video & Data Recording Conference", + year=1979, + address="Southampton", + month=Jul, + URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz" + } + @ARTICLE{MNW98, + author="Alistair Moffat and Radford Neal and Ian H. Witten", + title="Arithmetic Coding Revisited", + journal="{ACM} Transactions on Information Systems", + year=1998, + volume=16, + number=3, + pages="256--294", + month=Jul, + URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf" + }*/ + +/*This is meant to be a large, positive constant that can still be efficiently + loaded as an immediate (on platforms like ARM, for example). + Even relatively modest values like 100 would work fine.*/ +#define OD_EC_LOTS_OF_BITS (0x4000) + +static void od_ec_dec_refill(od_ec_dec *dec) { + int s; + od_ec_window dif; + int16_t cnt; + const unsigned char *bptr; + const unsigned char *end; + dif = dec->dif; + cnt = dec->cnt; + bptr = dec->bptr; + end = dec->end; + s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15); + for (; s >= 0 && bptr < end; s -= 8, bptr++) { + OD_ASSERT(s <= OD_EC_WINDOW_SIZE - 8); + dif ^= (od_ec_window)bptr[0] << s; + cnt += 8; + } + if (bptr >= end) { + dec->tell_offs += OD_EC_LOTS_OF_BITS - cnt; + cnt = OD_EC_LOTS_OF_BITS; + } + dec->dif = dif; + dec->cnt = cnt; + dec->bptr = bptr; +} + +/*Takes updated dif and range values, renormalizes them so that + 32768 <= rng < 65536 (reading more bytes from the stream into dif if + necessary), and stores them back in the decoder context. + dif: The new value of dif. + rng: The new value of the range. + ret: The value to return. + Return: ret. 
/*Takes updated dif and range values, renormalizes them so that
   32768 <= rng < 65536 (reading more bytes from the stream into dif if
   necessary), and stores them back in the decoder context.
  dif: The new value of dif.
  rng: The new value of the range.
  ret: The value to return.
  Return: ret.
          This allows the compiler to jump to this function via a tail-call.*/
static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng,
                               int ret) {
  int d;
  OD_ASSERT(rng <= 65535U);
  /*The number of bit positions needed to bring rng back up to 16 bits.*/
  d = 16 - OD_ILOG_NZ(rng);
  dec->cnt -= d;
#if CONFIG_EC_SMALLMUL
  /*This is equivalent to shifting in 1's instead of 0's.*/
  dec->dif = ((dif + 1) << d) - 1;
#else
  dec->dif = dif << d;
#endif
  dec->rng = rng << d;
  if (dec->cnt < 0) od_ec_dec_refill(dec);
  return ret;
}

/*Initializes the decoder.
  buf: The input buffer to use.
  storage: The size of the input buffer, in bytes.
  This function returns nothing; it clears dec->error, resets the range and
   window state, and primes the window with an initial refill.*/
void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf,
                    uint32_t storage) {
  dec->buf = buf;
  /*Raw bits are read backwards starting from the end of the buffer.*/
  dec->eptr = buf + storage;
  dec->end_window = 0;
  dec->nend_bits = 0;
  dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8);
  dec->end = buf + storage;
  dec->bptr = buf;
#if CONFIG_EC_SMALLMUL
  dec->dif = ((od_ec_window)1 << (OD_EC_WINDOW_SIZE - 1)) - 1;
#else
  dec->dif = 0;
#endif
  dec->rng = 0x8000;
  dec->cnt = -15;
  dec->error = 0;
  od_ec_dec_refill(dec);
}
+ Return: The value decoded (0 or 1).*/ +int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) { + od_ec_window dif; + od_ec_window vw; + unsigned r; + unsigned r_new; + unsigned v; + int ret; + OD_ASSERT(0 < f); + OD_ASSERT(f < 32768U); + dif = dec->dif; + r = dec->rng; + OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r); + OD_ASSERT(32768U <= r); +#if CONFIG_EC_SMALLMUL + v = (r >> 8) * (uint32_t)f >> 7; + vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); + ret = 1; + r_new = v; + if (dif >= vw) { + r_new = r - v; + dif -= vw; + ret = 0; + } +#else + v = f * (uint32_t)r >> 15; + vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); + ret = 0; + r_new = v; + if (dif >= vw) { + r_new = r - v; + dif -= vw; + ret = 1; + } +#endif + return od_ec_dec_normalize(dec, dif, r_new, ret); +} + +/*Decodes a symbol given a cumulative distribution function (CDF) table in Q15. + cdf: The CDF, such that symbol s falls in the range + [s > 0 ? cdf[s - 1] : 0, cdf[s]). + The values must be monotonically non-increasing, and cdf[nsyms - 1] + must be 32768. + {EC_SMALLMUL}: The CDF contains 32768 minus those values. + nsyms: The number of symbols in the alphabet. + This should be at most 16. 
/*Decodes a symbol given a cumulative distribution function (CDF) table in Q15.
  cdf: The CDF, such that symbol s falls in the range
        [s > 0 ? cdf[s - 1] : 0, cdf[s]).
       The values must be monotonically non-decreasing, and cdf[nsyms - 1]
        must be 32768.
       {EC_SMALLMUL}: The CDF contains 32768 minus those values, so the
        stored table is non-increasing and ends in 0.
  nsyms: The number of symbols in the alphabet.
         This should be at most 16.
         It is only used by the assertions.
  Return: The decoded symbol s.*/
int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *cdf, int nsyms) {
  od_ec_window dif;
  unsigned r;
  unsigned c;
  unsigned u;
  unsigned v;
  int ret;
  (void)nsyms;
  dif = dec->dif;
  r = dec->rng;
  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
  OD_ASSERT(cdf[nsyms - 1] == OD_ICDF(32768U));
  OD_ASSERT(32768U <= r);
#if CONFIG_EC_SMALLMUL
  /*The top 16 bits of the window are compared against the scaled CDF.*/
  c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
  v = r;
  ret = -1;
  /*Walk the table until the scaled entry drops to c or below; u keeps the
     previous (larger) boundary.*/
  do {
    u = v;
    v = (r >> 8) * (uint32_t)cdf[++ret] >> 7;
  } while (c < v);
  OD_ASSERT(v < u);
  OD_ASSERT(u <= r);
  r = u - v;
  dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
#else
  c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
  v = 0;
  ret = -1;
  /*Walk the table until the scaled entry exceeds c; u keeps the previous
     (smaller) boundary.*/
  do {
    u = v;
    v = cdf[++ret] * (uint32_t)r >> 15;
  } while (v <= c);
  OD_ASSERT(u < v);
  OD_ASSERT(v <= r);
  r = v - u;
  dif -= (od_ec_window)u << (OD_EC_WINDOW_SIZE - 16);
#endif
  return od_ec_dec_normalize(dec, dif, r, ret);
}
+ Return: The decoded bits.*/ +uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb) { + od_ec_window window; + int available; + uint32_t ret; + OD_ASSERT(ftb <= 25); + window = dec->end_window; + available = dec->nend_bits; + if ((unsigned)available < ftb) { + const unsigned char *buf; + const unsigned char *eptr; + buf = dec->buf; + eptr = dec->eptr; + OD_ASSERT(available <= OD_EC_WINDOW_SIZE - 8); + do { + if (eptr <= buf) { + dec->tell_offs += OD_EC_LOTS_OF_BITS - available; + available = OD_EC_LOTS_OF_BITS; + break; + } + window |= (od_ec_window) * --eptr << available; + available += 8; + } while (available <= OD_EC_WINDOW_SIZE - 8); + dec->eptr = eptr; + } + ret = (uint32_t)window & (((uint32_t)1 << ftb) - 1); + window >>= ftb; + available -= ftb; + dec->end_window = window; + dec->nend_bits = available; + return ret; +} +#endif + +/*Returns the number of bits "used" by the decoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Return: The number of bits. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +int od_ec_dec_tell(const od_ec_dec *dec) { + return (int)(((dec->end - dec->eptr) + (dec->bptr - dec->buf)) * 8 - + dec->cnt - dec->nend_bits + dec->tell_offs); +} + +/*Returns the number of bits "used" by the decoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Return: The number of bits scaled by 2**OD_BITRES. 
typedef struct od_ec_dec od_ec_dec;

/*od_ec_dec_bits() takes a label argument that is forwarded to
   od_ec_dec_bits_() only when OD_ACCOUNTING is enabled; otherwise it is
   dropped.*/
#if defined(OD_ACCOUNTING) && OD_ACCOUNTING
#define OD_ACC_STR , char *acc_str
#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb, str)
#else
#define OD_ACC_STR
#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb)
#endif

/*The entropy decoder context.*/
struct od_ec_dec {
  /*The start of the current input buffer.*/
  const unsigned char *buf;
  /*The read pointer for the raw bits.
    It starts at the end of the buffer and moves backwards.*/
  const unsigned char *eptr;
  /*Bits that will be read from/written at the end.*/
  od_ec_window end_window;
  /*Number of valid bits in end_window.*/
  int nend_bits;
  /*An offset used to keep track of tell after reaching the end of the stream.
    This is constant throughout most of the decoding process, but becomes
     important once we hit the end of the buffer and stop incrementing pointers
     (and instead pretend cnt/nend_bits have lots of bits).*/
  int32_t tell_offs;
  /*The end of the current input buffer.*/
  const unsigned char *end;
  /*The read pointer for the entropy-coded bits.*/
  const unsigned char *bptr;
  /*The difference between the coded value and the low end of the current
     range.
    {EC_SMALLMUL} The difference between the high end of the current range,
     (low + rng), and the coded value, minus 1.
    This stores up to OD_EC_WINDOW_SIZE bits of that difference, but the
     decoder only uses the top 16 bits of the window to decode the next symbol.
    As we shift up during renormalization, if we don't have enough bits left in
     the window to fill the top 16, we'll read in more bits of the coded
     value.*/
  od_ec_window dif;
  /*The number of values in the current range.*/
  uint16_t rng;
  /*The number of bits of data in the current value.*/
  int16_t cnt;
  /*Nonzero if an error occurred.*/
  int error;
};

/*See entdec.c for further documentation.*/

void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, uint32_t storage)
    OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);

OD_WARN_UNUSED_RESULT int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f)
    OD_ARG_NONNULL(1);
OD_WARN_UNUSED_RESULT int od_ec_decode_cdf_q15(od_ec_dec *dec,
                                               const uint16_t *cdf, int nsyms)
    OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);

OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb)
    OD_ARG_NONNULL(1);

OD_WARN_UNUSED_RESULT int od_ec_dec_tell(const od_ec_dec *dec)
    OD_ARG_NONNULL(1);
OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec)
    OD_ARG_NONNULL(1);
/*Takes updated low and range values, renormalizes them so that
   32768 <= rng < 65536 (flushing bytes from low to the pre-carry buffer if
   necessary), and stores them back in the encoder context.
  low: The new value of low.
  rng: The new value of the range.
  On allocation failure enc->error is set to -1, enc->offs is reset to 0 and
   the function returns without storing low/rng/cnt.*/
static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low,
                                unsigned rng) {
  int d;
  int c;
  int s;
  c = enc->cnt;
  OD_ASSERT(rng <= 65535U);
  /*The number of bit positions needed to bring rng back up to 16 bits.*/
  d = 16 - OD_ILOG_NZ(rng);
  s = c + d;
  /*TODO: Right now we flush every time we have at least one byte available.
    Instead we should use an od_ec_window and flush right before we're about to
     shift bits off the end of the window.
    For a 32-bit window this is about the same amount of work, but for a 64-bit
     window it should be a fair win.*/
  if (s >= 0) {
    uint16_t *buf;
    uint32_t storage;
    uint32_t offs;
    unsigned m;
    buf = enc->precarry_buf;
    storage = enc->precarry_storage;
    offs = enc->offs;
    /*Up to two entries may be written below, so make room for both.*/
    if (offs + 2 > storage) {
      storage = 2 * storage + 2;
      /*Only the local copy is overwritten here: if realloc() fails,
         enc->precarry_buf still refers to the old allocation.*/
      buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage);
      if (buf == NULL) {
        enc->error = -1;
        enc->offs = 0;
        return;
      }
      enc->precarry_buf = buf;
      enc->precarry_storage = storage;
    }
    c += 16;
    m = (1 << c) - 1;
    /*With 8 or more bits available, two entries are flushed instead of one.*/
    if (s >= 8) {
      OD_ASSERT(offs < storage);
      buf[offs++] = (uint16_t)(low >> c);
      low &= m;
      c -= 8;
      m >>= 8;
    }
    OD_ASSERT(offs < storage);
    /*Entries are 16 bits wide so that a carry bit can be stored alongside
       the byte (see the pre-carry buffer description above).*/
    buf[offs++] = (uint16_t)(low >> c);
    s = c + d - 24;
    low &= m;
    enc->offs = offs;
  }
  enc->low = low << d;
  enc->rng = rng << d;
  enc->cnt = s;
}
+ size: The initial size of the buffer, in bytes.*/ +void od_ec_enc_init(od_ec_enc *enc, uint32_t size) { + od_ec_enc_reset(enc); + enc->buf = (unsigned char *)malloc(sizeof(*enc->buf) * size); + enc->storage = size; + if (size > 0 && enc->buf == NULL) { + enc->storage = 0; + enc->error = -1; + } + enc->precarry_buf = (uint16_t *)malloc(sizeof(*enc->precarry_buf) * size); + enc->precarry_storage = size; + if (size > 0 && enc->precarry_buf == NULL) { + enc->precarry_storage = 0; + enc->error = -1; + } +} + +/*Reinitializes the encoder.*/ +void od_ec_enc_reset(od_ec_enc *enc) { + enc->end_offs = 0; + enc->end_window = 0; + enc->nend_bits = 0; + enc->offs = 0; + enc->low = 0; + enc->rng = 0x8000; + /*This is initialized to -9 so that it crosses zero after we've accumulated + one byte + one carry bit.*/ + enc->cnt = -9; + enc->error = 0; +#if OD_MEASURE_EC_OVERHEAD + enc->entropy = 0; + enc->nb_symbols = 0; +#endif +} + +/*Frees the buffers used by the encoder.*/ +void od_ec_enc_clear(od_ec_enc *enc) { + free(enc->precarry_buf); + free(enc->buf); +} + +/*Encodes a symbol given its frequency in Q15. + fl: The cumulative frequency of all symbols that come before the one to be + encoded. + fh: The cumulative frequency of all symbols up to and including the one to + be encoded. 
{EC_SMALLMUL} Both values are 32768 minus that.*/
static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) {
  od_ec_window l;
  unsigned r;
  unsigned u;
  unsigned v;
  /*Work on local copies of the interval; od_ec_enc_normalize() receives the
     updated values at the end.*/
  l = enc->low;
  r = enc->rng;
  OD_ASSERT(32768U <= r);
#if CONFIG_EC_SMALLMUL
  /*Inverse CDF convention: fh is strictly below fl here.*/
  OD_ASSERT(fh < fl);
  OD_ASSERT(fl <= 32768U);
  if (fl < 32768U) {
    /*Scale both bounds by the top 8 bits of the range only.*/
    u = (r >> 8) * (uint32_t)fl >> 7;
    v = (r >> 8) * (uint32_t)fh >> 7;
    l += r - u;
    r = u - v;
  } else {
    /*fl == 32768: low is left unchanged and only the range shrinks.*/
    r -= (r >> 8) * (uint32_t)fh >> 7;
  }
#else
  OD_ASSERT(fl < fh);
  OD_ASSERT(fh <= 32768U);
  /*Full 16x16 multiplies: [u, v) is the symbol's share of the range.*/
  u = fl * (uint32_t)r >> 15;
  v = fh * (uint32_t)r >> 15;
  r = v - u;
  l += u;
#endif
  od_ec_enc_normalize(enc, l, r);
#if OD_MEASURE_EC_OVERHEAD
  /*Accumulate the ideal cost of this symbol, in bits.*/
  enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / 32768.);
  enc->nb_symbols++;
#endif
}

/*Encode a single binary value.
  val: The value to encode (0 or 1).
  {EC_SMALLMUL} f: The probability that the val is one, scaled by 32768.
  {else} f: The probability that val is zero, scaled by 32768.*/
void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) {
  od_ec_window l;
  unsigned r;
  unsigned v;
  /*f must be strictly between 0 and 32768.*/
  OD_ASSERT(0 < f);
  OD_ASSERT(f < 32768U);
  l = enc->low;
  r = enc->rng;
  OD_ASSERT(32768U <= r);
#if CONFIG_EC_SMALLMUL
  /*v is the size of the sub-range assigned to val == 1; it sits at the top of
     the interval, so low only moves when val is nonzero.*/
  v = (r >> 8) * (uint32_t)f >> 7;
  if (val) l += r - v;
  r = val ? v : r - v;
#else
  /*v is the size of the sub-range assigned to val == 0; it sits at the bottom
     of the interval, so low only moves when val is nonzero.*/
  v = f * (uint32_t)r >> 15;
  if (val) l += v;
  r = val ? r - v : v;
#endif
  od_ec_enc_normalize(enc, l, r);
#if OD_MEASURE_EC_OVERHEAD
  enc->entropy -=
      OD_LOG2((double)(val ? 32768 - OD_ICDF(f) : OD_ICDF(f)) / 32768.);
  enc->nb_symbols++;
#endif
}

/*Encodes a symbol given a cumulative distribution function (CDF) table in Q15.
  s: The index of the symbol to encode.
  cdf: The CDF, such that symbol s falls in the range
        [s > 0 ? cdf[s - 1] : 0, cdf[s]).
       The values must be monotonically non-decreasing, and the last value
        must be exactly 32768.
  nsyms: The number of symbols in the alphabet.
This should be at most 16.*/
void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf,
                          int nsyms) {
  /*nsyms is only consulted by the assertions below.*/
  (void)nsyms;
  OD_ASSERT(s >= 0);
  OD_ASSERT(s < nsyms);
  OD_ASSERT(cdf[nsyms - 1] == OD_ICDF(32768U));
  /*The lower bound of symbol 0 is the CDF value for zero.*/
  od_ec_encode_q15(enc, s > 0 ? cdf[s - 1] : OD_ICDF(0), cdf[s]);
}

#if CONFIG_RAWBITS
/*Encodes a sequence of raw bits in the stream.
  fl: The bits to encode.
  ftb: The number of bits to encode.
       This must be between 0 and 25, inclusive.
  On allocation failure enc->error is set to -1, enc->end_offs is reset to 0
   and the bits are not stored.*/
void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb) {
  od_ec_window end_window;
  int nend_bits;
  OD_ASSERT(ftb <= 25);
  OD_ASSERT(fl < (uint32_t)1 << ftb);
#if OD_MEASURE_EC_OVERHEAD
  enc->entropy += ftb;
#endif
  end_window = enc->end_window;
  nend_bits = enc->nend_bits;
  /*If the new bits do not fit in the window, flush whole bytes first.*/
  if (nend_bits + ftb > OD_EC_WINDOW_SIZE) {
    unsigned char *buf;
    uint32_t storage;
    uint32_t end_offs;
    buf = enc->buf;
    storage = enc->storage;
    end_offs = enc->end_offs;
    if (end_offs + (OD_EC_WINDOW_SIZE >> 3) >= storage) {
      unsigned char *new_buf;
      uint32_t new_storage;
      new_storage = 2 * storage + (OD_EC_WINDOW_SIZE >> 3);
      /*A fresh allocation is used rather than realloc(): the raw bytes live
         at the end of the buffer and must be copied to the end of the new
         one.*/
      new_buf = (unsigned char *)malloc(sizeof(*new_buf) * new_storage);
      if (new_buf == NULL) {
        enc->error = -1;
        enc->end_offs = 0;
        return;
      }
      OD_COPY(new_buf + new_storage - end_offs, buf + storage - end_offs,
              end_offs);
      storage = new_storage;
      free(buf);
      enc->buf = buf = new_buf;
      enc->storage = storage;
    }
    /*Raw bytes are written backwards, starting from the end of the buffer.*/
    do {
      OD_ASSERT(end_offs < storage);
      buf[storage - ++end_offs] = (unsigned char)end_window;
      end_window >>= 8;
      nend_bits -= 8;
    } while (nend_bits >= 8);
    enc->end_offs = end_offs;
  }
  OD_ASSERT(nend_bits + ftb <= OD_EC_WINDOW_SIZE);
  /*Append the new bits above the ones already pending in the window.*/
  end_window |= (od_ec_window)fl << nend_bits;
  nend_bits += ftb;
  enc->end_window = end_window;
  enc->nend_bits = nend_bits;
}
#endif

/*Overwrites a few bits at the very start of an existing stream, after they
   have already been encoded.
+ This makes it possible to have a few flags up front, where it is easy for + decoders to access them without parsing the whole stream, even if their + values are not determined until late in the encoding process, without having + to buffer all the intermediate symbols in the encoder. + In order for this to work, at least nbits bits must have already been encoded + using probabilities that are an exact power of two. + The encoder can verify the number of encoded bits is sufficient, but cannot + check this latter condition. + val: The bits to encode (in the least nbits significant bits). + They will be decoded in order from most-significant to least. + nbits: The number of bits to overwrite. + This must be no more than 8.*/ +void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) { + int shift; + unsigned mask; + OD_ASSERT(nbits >= 0); + OD_ASSERT(nbits <= 8); + OD_ASSERT(val < 1U << nbits); + shift = 8 - nbits; + mask = ((1U << nbits) - 1) << shift; + if (enc->offs > 0) { + /*The first byte has been finalized.*/ + enc->precarry_buf[0] = + (uint16_t)((enc->precarry_buf[0] & ~mask) | val << shift); + } else if (9 + enc->cnt + (enc->rng == 0x8000) > nbits) { + /*The first byte has yet to be output.*/ + enc->low = (enc->low & ~((od_ec_window)mask << (16 + enc->cnt))) | + (od_ec_window)val << (16 + enc->cnt + shift); + } else { + /*The encoder hasn't even encoded _nbits of data yet.*/ + enc->error = -1; + } +} + +#if OD_MEASURE_EC_OVERHEAD +#include +#endif + +/*Indicates that there are no more symbols to encode. + All remaining output bytes are flushed to the output buffer. + od_ec_enc_reset() should be called before using the encoder again. + bytes: Returns the size of the encoded data in the returned buffer. 
Return: A pointer to the start of the final buffer, or NULL if there was an
           encoding error.*/
unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
  unsigned char *out;
  uint32_t storage;
  uint16_t *buf;
  uint32_t offs;
  uint32_t end_offs;
  int nend_bits;
  od_ec_window m;
  od_ec_window e;
  od_ec_window l;
  unsigned r;
  int c;
  int s;
  /*A previous failure (e.g., an allocation error) makes the stream unusable;
     *nbytes is left unwritten in that case.*/
  if (enc->error) return NULL;
#if OD_MEASURE_EC_OVERHEAD
  {
    uint32_t tell;
    /* Don't count the 1 bit we lose to raw bits as overhead. */
    tell = od_ec_enc_tell(enc) - 1;
    fprintf(stderr, "overhead: %f%%\n",
            100 * (tell - enc->entropy) / enc->entropy);
    fprintf(stderr, "efficiency: %f bits/symbol\n",
            (double)tell / enc->nb_symbols);
  }
#endif
  /*We output the minimum number of bits that ensures that the symbols encoded
     thus far will be decoded correctly regardless of the bits that follow.*/
  l = enc->low;
  r = enc->rng;
  c = enc->cnt;
  s = 9;
  m = 0x7FFF;
  /*e is l rounded up to a multiple of m + 1; keep halving the granularity
     until e with all of its masked-off bits set still lies below l + r.*/
  e = (l + m) & ~m;
  while ((e | m) >= l + r) {
    s++;
    m >>= 1;
    e = (l + m) & ~m;
  }
  s += c;
  offs = enc->offs;
  buf = enc->precarry_buf;
  if (s > 0) {
    unsigned n;
    storage = enc->precarry_storage;
    if (offs + ((s + 7) >> 3) > storage) {
      storage = storage * 2 + ((s + 7) >> 3);
      /*Only the local copy is overwritten here, so on failure
         enc->precarry_buf still refers to the original allocation.*/
      buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage);
      if (buf == NULL) {
        enc->error = -1;
        return NULL;
      }
      enc->precarry_buf = buf;
      enc->precarry_storage = storage;
    }
    /*Emit the terminating value one pre-carry entry at a time; n masks off
       the bits that have just been written.*/
    n = (1 << (c + 16)) - 1;
    do {
      OD_ASSERT(offs < storage);
      buf[offs++] = (uint16_t)(e >> (c + 16));
      e &= n;
      s -= 8;
      c -= 8;
      n >>= 8;
    } while (s > 0);
  }
  /*Make sure there's enough room for the entropy-coded bits and the raw
     bits.*/
  out = enc->buf;
  storage = enc->storage;
  end_offs = enc->end_offs;
  e = enc->end_window;
  nend_bits = enc->nend_bits;
  /*s is now the number of unused bits left in the last entropy-coded byte,
     and c the number of extra bytes the pending raw bits require.*/
  s = -s;
  c = OD_MAXI((nend_bits - s + 7) >> 3, 0);
  if (offs + end_offs + c > storage) {
    storage = offs + end_offs + c;
    /*As above, enc->buf is only updated once realloc() has succeeded.*/
    out = (unsigned char *)realloc(out, sizeof(*out) * storage);
    if (out == NULL) {
      enc->error = -1;
      return NULL;
    }
    /*The raw bytes sit at the end of the buffer; move them to the new end.*/
    OD_MOVE(out + storage - end_offs, out + enc->storage - end_offs, end_offs);
    enc->buf = out;
    enc->storage = storage;
  }
  /*If we have buffered raw bits, flush them as well.*/
  while (nend_bits > s) {
    OD_ASSERT(end_offs < storage);
    out[storage - ++end_offs] = (unsigned char)e;
    e >>= 8;
    nend_bits -= 8;
  }
  *nbytes = offs + end_offs;
  /*Perform carry propagation.*/
  OD_ASSERT(offs + end_offs <= storage);
  /*The entropy-coded bytes are placed directly in front of the raw bytes, so
     the returned pointer is generally not the start of enc->buf.*/
  out = out + storage - (offs + end_offs);
  c = 0;
  /*end_offs is reused from here on as the count of entropy-coded bytes.*/
  end_offs = offs;
  while (offs > 0) {
    offs--;
    c = buf[offs] + c;
    out[offs] = (unsigned char)c;
    c >>= 8;
  }
  /*Add any remaining raw bits to the last byte.
    There is guaranteed to be enough room, because nend_bits <= s.*/
  OD_ASSERT(nend_bits <= 0 || end_offs > 0);
  if (nend_bits > 0) out[end_offs - 1] |= (unsigned char)e;
  /*Note: Unless there's an allocation error, if you keep encoding into the
     current buffer and call this function again later, everything will work
     just fine (you won't get a new packet out, but you will get a single
     buffer with the new data appended to the old).
    However, this function is O(N) where N is the amount of data coded so far,
     so calling it more than once for a given packet is a bad idea.*/
  return out;
}

/*Returns the number of bits "used" by the encoded symbols so far.
  This same number can be computed in either the encoder or the decoder, and is
   suitable for making coding decisions.
  Warning: The value returned by this function can decrease compared to an
   earlier call, even after encoding more data, if there is an encoding error
   (i.e., a failure to allocate enough space for the output buffer).
  Return: The number of bits.
+ This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +int od_ec_enc_tell(const od_ec_enc *enc) { + /*The 10 here counteracts the offset of -9 baked into cnt, and adds 1 extra + bit, which we reserve for terminating the stream.*/ + return (enc->offs + enc->end_offs) * 8 + enc->cnt + enc->nend_bits + 10; +} + +/*Returns the number of bits "used" by the encoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Warning: The value returned by this function can decrease compared to an + earlier call, even after encoding more data, if there is an encoding error + (i.e., a failure to allocate enough space for the output buffer). + Return: The number of bits scaled by 2**OD_BITRES. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) { + return od_ec_tell_frac(od_ec_enc_tell(enc), enc->rng); +} + +/*Saves a entropy coder checkpoint to dst. + This allows an encoder to reverse a series of entropy coder + decisions if it decides that the information would have been + better coded some other way.*/ +void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src) { + OD_COPY(dst, src, 1); +} + +/*Restores an entropy coder checkpoint saved by od_ec_enc_checkpoint. + This can only be used to restore from checkpoints earlier in the target + state's history: you can not switch backwards and forwards or otherwise + switch to a state which isn't a casual ancestor of the current state. 
+ Restore is also incompatible with patching the initial bits, as the + changes will remain in the restored version.*/ +void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src) { + unsigned char *buf; + uint32_t storage; + uint16_t *precarry_buf; + uint32_t precarry_storage; + OD_ASSERT(dst->storage >= src->storage); + OD_ASSERT(dst->precarry_storage >= src->precarry_storage); + buf = dst->buf; + storage = dst->storage; + precarry_buf = dst->precarry_buf; + precarry_storage = dst->precarry_storage; + OD_COPY(dst, src, 1); + dst->buf = buf; + dst->storage = storage; + dst->precarry_buf = precarry_buf; + dst->precarry_storage = precarry_storage; +} diff --git a/third_party/aom/aom_dsp/entenc.h b/third_party/aom/aom_dsp/entenc.h new file mode 100644 index 000000000..314b36318 --- /dev/null +++ b/third_party/aom/aom_dsp/entenc.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#if !defined(_entenc_H) +#define _entenc_H (1) +#include +#include "aom_dsp/entcode.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct od_ec_enc od_ec_enc; + +#define OD_MEASURE_EC_OVERHEAD (0) + +/*The entropy encoder context.*/ +struct od_ec_enc { + /*Buffered output. 
+ This contains only the raw bits until the final call to od_ec_enc_done(), + where all the arithmetic-coded data gets prepended to it.*/ + unsigned char *buf; + /*The size of the buffer.*/ + uint32_t storage; + /*The offset at which the last byte containing raw bits was written.*/ + uint32_t end_offs; + /*Bits that will be read from/written at the end.*/ + od_ec_window end_window; + /*Number of valid bits in end_window.*/ + int nend_bits; + /*A buffer for output bytes with their associated carry flags.*/ + uint16_t *precarry_buf; + /*The size of the pre-carry buffer.*/ + uint32_t precarry_storage; + /*The offset at which the next entropy-coded byte will be written.*/ + uint32_t offs; + /*The low end of the current range.*/ + od_ec_window low; + /*The number of values in the current range.*/ + uint16_t rng; + /*The number of bits of data in the current value.*/ + int16_t cnt; + /*Nonzero if an error occurred.*/ + int error; +#if OD_MEASURE_EC_OVERHEAD + double entropy; + int nb_symbols; +#endif +}; + +/*See entenc.c for further documentation.*/ + +void od_ec_enc_init(od_ec_enc *enc, uint32_t size) OD_ARG_NONNULL(1); +void od_ec_enc_reset(od_ec_enc *enc) OD_ARG_NONNULL(1); +void od_ec_enc_clear(od_ec_enc *enc) OD_ARG_NONNULL(1); + +void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f_q15) + OD_ARG_NONNULL(1); +void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf, int nsyms) + OD_ARG_NONNULL(1) OD_ARG_NONNULL(3); + +void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb) + OD_ARG_NONNULL(1); + +void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) + OD_ARG_NONNULL(1); +OD_WARN_UNUSED_RESULT unsigned char *od_ec_enc_done(od_ec_enc *enc, + uint32_t *nbytes) + OD_ARG_NONNULL(1) OD_ARG_NONNULL(2); + +OD_WARN_UNUSED_RESULT int od_ec_enc_tell(const od_ec_enc *enc) + OD_ARG_NONNULL(1); +OD_WARN_UNUSED_RESULT uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) + OD_ARG_NONNULL(1); + +void od_ec_enc_checkpoint(od_ec_enc 
*dst, const od_ec_enc *src); +void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/third_party/aom/aom_dsp/fastssim.c b/third_party/aom/aom_dsp/fastssim.c new file mode 100644 index 000000000..09d945afc --- /dev/null +++ b/third_party/aom/aom_dsp/fastssim.c @@ -0,0 +1,493 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + * + * This code was originally written by: Nathan E. Egge, at the Daala + * project. + */ +#include +#include +#include +#include +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/ssim.h" +#include "aom_ports/system_state.h" + +typedef struct fs_level fs_level; +typedef struct fs_ctx fs_ctx; + +#define SSIM_C1 (255 * 255 * 0.01 * 0.01) +#define SSIM_C2 (255 * 255 * 0.03 * 0.03) +#if CONFIG_HIGHBITDEPTH +#define SSIM_C1_10 (1023 * 1023 * 0.01 * 0.01) +#define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01) +#define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03) +#define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03) +#endif +#define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b)) +#define FS_MAXI(_a, _b) ((_a) > (_b) ? 
(_a) : (_b)) + +struct fs_level { + uint32_t *im1; + uint32_t *im2; + double *ssim; + int w; + int h; +}; + +struct fs_ctx { + fs_level *level; + int nlevels; + unsigned *col_buf; +}; + +static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { + unsigned char *data; + size_t data_size; + int lw; + int lh; + int l; + lw = (_w + 1) >> 1; + lh = (_h + 1) >> 1; + data_size = + _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf); + for (l = 0; l < _nlevels; l++) { + size_t im_size; + size_t level_size; + im_size = lw * (size_t)lh; + level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); + level_size += sizeof(*_ctx->level[l].ssim) - 1; + level_size /= sizeof(*_ctx->level[l].ssim); + level_size += im_size; + level_size *= sizeof(*_ctx->level[l].ssim); + data_size += level_size; + lw = (lw + 1) >> 1; + lh = (lh + 1) >> 1; + } + data = (unsigned char *)malloc(data_size); + _ctx->level = (fs_level *)data; + _ctx->nlevels = _nlevels; + data += _nlevels * sizeof(*_ctx->level); + lw = (_w + 1) >> 1; + lh = (_h + 1) >> 1; + for (l = 0; l < _nlevels; l++) { + size_t im_size; + size_t level_size; + _ctx->level[l].w = lw; + _ctx->level[l].h = lh; + im_size = lw * (size_t)lh; + level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); + level_size += sizeof(*_ctx->level[l].ssim) - 1; + level_size /= sizeof(*_ctx->level[l].ssim); + level_size *= sizeof(*_ctx->level[l].ssim); + _ctx->level[l].im1 = (uint32_t *)data; + _ctx->level[l].im2 = _ctx->level[l].im1 + im_size; + data += level_size; + _ctx->level[l].ssim = (double *)data; + data += im_size * sizeof(*_ctx->level[l].ssim); + lw = (lw + 1) >> 1; + lh = (lh + 1) >> 1; + } + _ctx->col_buf = (unsigned *)data; +} + +static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); } + +static void fs_downsample_level(fs_ctx *_ctx, int _l) { + const uint32_t *src1; + const uint32_t *src2; + uint32_t *dst1; + uint32_t *dst2; + int w2; + int h2; + int w; + int h; + int i; + int j; + w = _ctx->level[_l].w; + h = 
_ctx->level[_l].h; + dst1 = _ctx->level[_l].im1; + dst2 = _ctx->level[_l].im2; + w2 = _ctx->level[_l - 1].w; + h2 = _ctx->level[_l - 1].h; + src1 = _ctx->level[_l - 1].im1; + src2 = _ctx->level[_l - 1].im2; + for (j = 0; j < h; j++) { + int j0offs; + int j1offs; + j0offs = 2 * j * w2; + j1offs = FS_MINI(2 * j + 1, h2) * w2; + for (i = 0; i < w; i++) { + int i0; + int i1; + i0 = 2 * i; + i1 = FS_MINI(i0 + 1, w2); + dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] + + src1[j1offs + i0] + src1[j1offs + i1]; + dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] + + src2[j1offs + i0] + src2[j1offs + i1]; + } + } +} + +static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1, + int _s1ystride, const uint8_t *_src2, + int _s2ystride, int _w, int _h, uint32_t bd, + uint32_t shift) { + uint32_t *dst1; + uint32_t *dst2; + int w; + int h; + int i; + int j; + w = _ctx->level[0].w; + h = _ctx->level[0].h; + dst1 = _ctx->level[0].im1; + dst2 = _ctx->level[0].im2; + for (j = 0; j < h; j++) { + int j0; + int j1; + j0 = 2 * j; + j1 = FS_MINI(j0 + 1, _h); + for (i = 0; i < w; i++) { + int i0; + int i1; + i0 = 2 * i; + i1 = FS_MINI(i0 + 1, _w); + if (bd == 8 && shift == 0) { + dst1[j * w + i] = + _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] + + _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1]; + dst2[j * w + i] = + _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] + + _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1]; + } else { + uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1); + uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2); + dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) + + (src1s[j0 * _s1ystride + i1] >> shift) + + (src1s[j1 * _s1ystride + i0] >> shift) + + (src1s[j1 * _s1ystride + i1] >> shift); + dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) + + (src2s[j0 * _s2ystride + i1] >> shift) + + (src2s[j1 * _s2ystride + i0] >> shift) + + (src2s[j1 * _s2ystride + i1] >> shift); + } + } + } +} + +static void 
fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { + unsigned *col_sums_x; + unsigned *col_sums_y; + uint32_t *im1; + uint32_t *im2; + double *ssim; + double c1; + int w; + int h; + int j0offs; + int j1offs; + int i; + int j; + double ssim_c1 = SSIM_C1; +#if CONFIG_HIGHBITDEPTH + if (bit_depth == 10) ssim_c1 = SSIM_C1_10; + if (bit_depth == 12) ssim_c1 = SSIM_C1_12; +#else + assert(bit_depth == 8); + (void)bit_depth; +#endif + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + col_sums_x = _ctx->col_buf; + col_sums_y = col_sums_x + w; + im1 = _ctx->level[_l].im1; + im2 = _ctx->level[_l].im2; + for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i]; + for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i]; + for (j = 1; j < 4; j++) { + j1offs = FS_MINI(j, h - 1) * w; + for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; + } + ssim = _ctx->level[_l].ssim; + c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l)); + for (j = 0; j < h; j++) { + unsigned mux; + unsigned muy; + int i0; + int i1; + mux = 5 * col_sums_x[0]; + muy = 5 * col_sums_y[0]; + for (i = 1; i < 4; i++) { + i1 = FS_MINI(i, w - 1); + mux += col_sums_x[i1]; + muy += col_sums_y[i1]; + } + for (i = 0; i < w; i++) { + ssim[j * w + i] *= (2 * mux * (double)muy + c1) / + (mux * (double)mux + muy * (double)muy + c1); + if (i + 1 < w) { + i0 = FS_MAXI(0, i - 4); + i1 = FS_MINI(i + 4, w - 1); + mux += col_sums_x[i1] - col_sums_x[i0]; + muy += col_sums_x[i1] - col_sums_x[i0]; + } + } + if (j + 1 < h) { + j0offs = FS_MAXI(0, j - 4) * w; + for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i]; + j1offs = FS_MINI(j + 4, h - 1) * w; + for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; + } + } +} + +#define FS_COL_SET(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ + gx = gx_buf[((j + (_joffs)) & 7) * stride + i + 
(_ioffs)]; \ + gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + col_sums_gx2[(_col)] = gx * (double)gx; \ + col_sums_gy2[(_col)] = gy * (double)gy; \ + col_sums_gxgy[(_col)] = gx * (double)gy; \ + } while (0) + +#define FS_COL_ADD(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ + gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + col_sums_gx2[(_col)] += gx * (double)gx; \ + col_sums_gy2[(_col)] += gy * (double)gy; \ + col_sums_gxgy[(_col)] += gx * (double)gy; \ + } while (0) + +#define FS_COL_SUB(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ + gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + col_sums_gx2[(_col)] -= gx * (double)gx; \ + col_sums_gy2[(_col)] -= gy * (double)gy; \ + col_sums_gxgy[(_col)] -= gx * (double)gy; \ + } while (0) + +#define FS_COL_COPY(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \ + col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \ + } while (0) + +#define FS_COL_HALVE(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \ + col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \ + } while (0) + +#define FS_COL_DOUBLE(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \ + col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \ + } while (0) + +static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { + uint32_t *im1; + uint32_t *im2; + unsigned *gx_buf; + unsigned *gy_buf; + double *ssim; + double col_sums_gx2[8]; + double col_sums_gy2[8]; + double col_sums_gxgy[8]; + double c2; + int stride; + int w; + int h; + int i; + int j; + double ssim_c2 = SSIM_C2; +#if CONFIG_HIGHBITDEPTH + if 
(bit_depth == 10) ssim_c2 = SSIM_C2_10; + if (bit_depth == 12) ssim_c2 = SSIM_C2_12; +#else + assert(bit_depth == 8); + (void)bit_depth; +#endif + + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + im1 = _ctx->level[_l].im1; + im2 = _ctx->level[_l].im2; + ssim = _ctx->level[_l].ssim; + gx_buf = _ctx->col_buf; + stride = w + 8; + gy_buf = gx_buf + 8 * stride; + memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf)); + c2 = ssim_c2 * (1 << 4 * _l) * 16 * 104; + for (j = 0; j < h + 4; j++) { + if (j < h - 1) { + for (i = 0; i < w - 1; i++) { + unsigned g1; + unsigned g2; + unsigned gx; + unsigned gy; + g1 = abs((int)im1[(j + 1) * w + i + 1] - (int)im1[j * w + i]); + g2 = abs((int)im1[(j + 1) * w + i] - (int)im1[j * w + i + 1]); + gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); + g1 = abs((int)im2[(j + 1) * w + i + 1] - (int)im2[j * w + i]); + g2 = abs((int)im2[(j + 1) * w + i] - (int)im2[j * w + i + 1]); + gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); + gx_buf[(j & 7) * stride + i + 4] = gx; + gy_buf[(j & 7) * stride + i + 4] = gy; + } + } else { + memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf)); + memset(gy_buf + (j & 7) * stride, 0, stride * sizeof(*gy_buf)); + } + if (j >= 4) { + int k; + col_sums_gx2[3] = col_sums_gx2[2] = col_sums_gx2[1] = col_sums_gx2[0] = 0; + col_sums_gy2[3] = col_sums_gy2[2] = col_sums_gy2[1] = col_sums_gy2[0] = 0; + col_sums_gxgy[3] = col_sums_gxgy[2] = col_sums_gxgy[1] = + col_sums_gxgy[0] = 0; + for (i = 4; i < 8; i++) { + FS_COL_SET(i, -1, 0); + FS_COL_ADD(i, 0, 0); + for (k = 1; k < 8 - i; k++) { + FS_COL_DOUBLE(i, i); + FS_COL_ADD(i, -k - 1, 0); + FS_COL_ADD(i, k, 0); + } + } + for (i = 0; i < w; i++) { + double mugx2; + double mugy2; + double mugxgy; + mugx2 = col_sums_gx2[0]; + for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k]; + mugy2 = col_sums_gy2[0]; + for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k]; + mugxgy = col_sums_gxgy[0]; + for (k = 1; k < 8; k++) mugxgy += col_sums_gxgy[k]; + ssim[(j - 4) * w + i] = (2 * mugxgy + 
c2) / (mugx2 + mugy2 + c2); + if (i + 1 < w) { + FS_COL_SET(0, -1, 1); + FS_COL_ADD(0, 0, 1); + FS_COL_SUB(2, -3, 2); + FS_COL_SUB(2, 2, 2); + FS_COL_HALVE(1, 2); + FS_COL_SUB(3, -4, 3); + FS_COL_SUB(3, 3, 3); + FS_COL_HALVE(2, 3); + FS_COL_COPY(3, 4); + FS_COL_DOUBLE(4, 5); + FS_COL_ADD(4, -4, 5); + FS_COL_ADD(4, 3, 5); + FS_COL_DOUBLE(5, 6); + FS_COL_ADD(5, -3, 6); + FS_COL_ADD(5, 2, 6); + FS_COL_DOUBLE(6, 7); + FS_COL_ADD(6, -2, 7); + FS_COL_ADD(6, 1, 7); + FS_COL_SET(7, -1, 8); + FS_COL_ADD(7, 0, 8); + } + } + } + } +} + +#define FS_NLEVELS (4) + +/*These weights were derived from the default weights found in Wang's original + Matlab implementation: {0.0448, 0.2856, 0.2363, 0.1333}. + We drop the finest scale and renormalize the rest to sum to 1.*/ + +static const double FS_WEIGHTS[FS_NLEVELS] = { + 0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625 +}; + +static double fs_average(fs_ctx *_ctx, int _l) { + double *ssim; + double ret; + int w; + int h; + int i; + int j; + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + ssim = _ctx->level[_l].ssim; + ret = 0; + for (j = 0; j < h; j++) + for (i = 0; i < w; i++) ret += ssim[j * w + i]; + return pow(ret / (w * h), FS_WEIGHTS[_l]); +} + +static double convert_ssim_db(double _ssim, double _weight) { + assert(_weight >= _ssim); + if ((_weight - _ssim) < 1e-10) return MAX_SSIM_DB; + return 10 * (log10(_weight) - log10(_weight - _ssim)); +} + +static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst, + int _dystride, int _w, int _h, uint32_t _bd, + uint32_t _shift) { + fs_ctx ctx; + double ret; + int l; + ret = 1; + fs_ctx_init(&ctx, _w, _h, FS_NLEVELS); + fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _bd, + _shift); + for (l = 0; l < FS_NLEVELS - 1; l++) { + fs_calc_structure(&ctx, l, _bd); + ret *= fs_average(&ctx, l); + fs_downsample_level(&ctx, l + 1); + } + fs_calc_structure(&ctx, l, _bd); + fs_apply_luminance(&ctx, l, _bd); + ret *= 
fs_average(&ctx, l); + fs_ctx_clear(&ctx); + return ret; +} + +double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *ssim_y, + double *ssim_u, double *ssim_v, uint32_t bd, + uint32_t in_bd) { + double ssimv; + uint32_t bd_shift = 0; + aom_clear_system_state(); + assert(bd >= in_bd); + + bd_shift = bd - in_bd; + + *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer, + dest->y_stride, source->y_crop_width, + source->y_crop_height, in_bd, bd_shift); + *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, bd_shift); + *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, bd_shift); + ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v)); + return convert_ssim_db(ssimv, 1.0); +} diff --git a/third_party/aom/aom_dsp/fwd_txfm.c b/third_party/aom/aom_dsp/fwd_txfm.c new file mode 100644 index 000000000..12ee02ba1 --- /dev/null +++ b/third_party/aom/aom_dsp/fwd_txfm.c @@ -0,0 +1,809 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/fwd_txfm.h" +#include +#include "./aom_dsp_rtcd.h" + +void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { + // The 2D transform is done with two passes which are actually pretty + // similar. In the first one, we transform the columns and transpose + // the results. 
In the second one, we transform the rows. To achieve that, + // as the first pass results are transposed, we transpose the columns (that + // is the transposed rows) and transpose the results (so that it goes back + // in normal/row positions). + int pass; + // We need an intermediate buffer between passes. + tran_low_t intermediate[4 * 4]; + const tran_low_t *in_low = NULL; + tran_low_t *out = intermediate; + // Do the two transform/transpose passes + for (pass = 0; pass < 2; ++pass) { + tran_high_t in_high[4]; // canbe16 + tran_high_t step[4]; // canbe16 + tran_high_t temp1, temp2; // needs32 + int i; + for (i = 0; i < 4; ++i) { + // Load inputs. + if (pass == 0) { + in_high[0] = input[0 * stride] * 16; + in_high[1] = input[1 * stride] * 16; + in_high[2] = input[2 * stride] * 16; + in_high[3] = input[3 * stride] * 16; + if (i == 0 && in_high[0]) { + ++in_high[0]; + } + } else { + assert(in_low != NULL); + in_high[0] = in_low[0 * 4]; + in_high[1] = in_low[1 * 4]; + in_high[2] = in_low[2 * 4]; + in_high[3] = in_low[3 * 4]; + ++in_low; + } + // Transform. + step[0] = in_high[0] + in_high[3]; + step[1] = in_high[1] + in_high[2]; + step[2] = in_high[1] - in_high[2]; + step[3] = in_high[0] - in_high[3]; + temp1 = (step[0] + step[1]) * cospi_16_64; + temp2 = (step[0] - step[1]) * cospi_16_64; + out[0] = (tran_low_t)fdct_round_shift(temp1); + out[2] = (tran_low_t)fdct_round_shift(temp2); + temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; + temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; + out[1] = (tran_low_t)fdct_round_shift(temp1); + out[3] = (tran_low_t)fdct_round_shift(temp2); + // Do next column (which is a transposed row in second/horizontal pass) + ++input; + out += 4; + } + // Setup in/out for next pass. 
+ in_low = intermediate; + out = output; + } + + { + int i, j; + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2; + } + } +} + +void aom_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { + int r, c; + tran_low_t sum = 0; + for (r = 0; r < 4; ++r) + for (c = 0; c < 4; ++c) sum += input[r * stride + c]; + + output[0] = sum << 1; +} + +void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { + int i, j; + tran_low_t intermediate[64]; + int pass; + tran_low_t *output = intermediate; + const tran_low_t *in = NULL; + + // Transform columns + for (pass = 0; pass < 2; ++pass) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 + tran_high_t t0, t1, t2, t3; // needs32 + tran_high_t x0, x1, x2, x3; // canbe16 + + for (i = 0; i < 8; i++) { + // stage 1 + if (pass == 0) { + s0 = (input[0 * stride] + input[7 * stride]) * 4; + s1 = (input[1 * stride] + input[6 * stride]) * 4; + s2 = (input[2 * stride] + input[5 * stride]) * 4; + s3 = (input[3 * stride] + input[4 * stride]) * 4; + s4 = (input[3 * stride] - input[4 * stride]) * 4; + s5 = (input[2 * stride] - input[5 * stride]) * 4; + s6 = (input[1 * stride] - input[6 * stride]) * 4; + s7 = (input[0 * stride] - input[7 * stride]) * 4; + ++input; + } else { + s0 = in[0 * 8] + in[7 * 8]; + s1 = in[1 * 8] + in[6 * 8]; + s2 = in[2 * 8] + in[5 * 8]; + s3 = in[3 * 8] + in[4 * 8]; + s4 = in[3 * 8] - in[4 * 8]; + s5 = in[2 * 8] - in[5 * 8]; + s6 = in[1 * 8] - in[6 * 8]; + s7 = in[0 * 8] - in[7 * 8]; + ++in; + } + + // fdct4(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x2 * cospi_24_64 + x3 * cospi_8_64; + t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; + output[0] = (tran_low_t)fdct_round_shift(t0); + output[2] = (tran_low_t)fdct_round_shift(t2); + output[4] = (tran_low_t)fdct_round_shift(t1); + output[6] = (tran_low_t)fdct_round_shift(t3); + + 
// Stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = fdct_round_shift(t0); + t3 = fdct_round_shift(t1); + + // Stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; + + // Stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + output[1] = (tran_low_t)fdct_round_shift(t0); + output[3] = (tran_low_t)fdct_round_shift(t2); + output[5] = (tran_low_t)fdct_round_shift(t1); + output[7] = (tran_low_t)fdct_round_shift(t3); + output += 8; + } + in = intermediate; + output = final_output; + } + + // Rows + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2; + } +} + +void aom_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) { + int r, c; + tran_low_t sum = 0; + for (r = 0; r < 8; ++r) + for (c = 0; c < 8; ++c) sum += input[r * stride + c]; + + output[0] = sum; +} + +void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { + // The 2D transform is done with two passes which are actually pretty + // similar. In the first one, we transform the columns and transpose + // the results. In the second one, we transform the rows. To achieve that, + // as the first pass results are transposed, we transpose the columns (that + // is the transposed rows) and transpose the results (so that it goes back + // in normal/row positions). + int pass; + // We need an intermediate buffer between passes. 
+ tran_low_t intermediate[256]; + const tran_low_t *in_low = NULL; + tran_low_t *out = intermediate; + // Do the two transform/transpose passes + for (pass = 0; pass < 2; ++pass) { + tran_high_t step1[8]; // canbe16 + tran_high_t step2[8]; // canbe16 + tran_high_t step3[8]; // canbe16 + tran_high_t in_high[8]; // canbe16 + tran_high_t temp1, temp2; // needs32 + int i; + for (i = 0; i < 16; i++) { + if (0 == pass) { + // Calculate input for the first 8 results. + in_high[0] = (input[0 * stride] + input[15 * stride]) * 4; + in_high[1] = (input[1 * stride] + input[14 * stride]) * 4; + in_high[2] = (input[2 * stride] + input[13 * stride]) * 4; + in_high[3] = (input[3 * stride] + input[12 * stride]) * 4; + in_high[4] = (input[4 * stride] + input[11 * stride]) * 4; + in_high[5] = (input[5 * stride] + input[10 * stride]) * 4; + in_high[6] = (input[6 * stride] + input[9 * stride]) * 4; + in_high[7] = (input[7 * stride] + input[8 * stride]) * 4; + // Calculate input for the next 8 results. + step1[0] = (input[7 * stride] - input[8 * stride]) * 4; + step1[1] = (input[6 * stride] - input[9 * stride]) * 4; + step1[2] = (input[5 * stride] - input[10 * stride]) * 4; + step1[3] = (input[4 * stride] - input[11 * stride]) * 4; + step1[4] = (input[3 * stride] - input[12 * stride]) * 4; + step1[5] = (input[2 * stride] - input[13 * stride]) * 4; + step1[6] = (input[1 * stride] - input[14 * stride]) * 4; + step1[7] = (input[0 * stride] - input[15 * stride]) * 4; + } else { + // Calculate input for the first 8 results. 
+ assert(in_low != NULL); + in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2); + in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2); + in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2); + in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2); + in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2); + in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2); + in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2); + in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2); + // Calculate input for the next 8 results. + step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2); + step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2); + step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2); + step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2); + step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2); + step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2); + step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2); + step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2); + in_low++; + } + // Work on the first eight values; fdct8(input, even_results); + { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 + tran_high_t t0, t1, t2, t3; // needs32 + tran_high_t x0, x1, x2, x3; // canbe16 + + // stage 1 + s0 = in_high[0] + in_high[7]; + s1 = in_high[1] + in_high[6]; + s2 = in_high[2] + in_high[5]; + s3 = in_high[3] + in_high[4]; + s4 = in_high[3] - in_high[4]; + s5 = in_high[2] - in_high[5]; + s6 = in_high[1] - in_high[6]; + s7 = in_high[0] - in_high[7]; + + // fdct4(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x3 * cospi_8_64 + x2 * cospi_24_64; + t3 = x3 * cospi_24_64 - x2 * cospi_8_64; + out[0] = 
(tran_low_t)fdct_round_shift(t0); + out[4] = (tran_low_t)fdct_round_shift(t2); + out[8] = (tran_low_t)fdct_round_shift(t1); + out[12] = (tran_low_t)fdct_round_shift(t3); + + // Stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = fdct_round_shift(t0); + t3 = fdct_round_shift(t1); + + // Stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; + + // Stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + out[2] = (tran_low_t)fdct_round_shift(t0); + out[6] = (tran_low_t)fdct_round_shift(t2); + out[10] = (tran_low_t)fdct_round_shift(t1); + out[14] = (tran_low_t)fdct_round_shift(t3); + } + // Work on the next eight values; step1 -> odd_results + { + // step 2 + temp1 = (step1[5] - step1[2]) * cospi_16_64; + temp2 = (step1[4] - step1[3]) * cospi_16_64; + step2[2] = fdct_round_shift(temp1); + step2[3] = fdct_round_shift(temp2); + temp1 = (step1[4] + step1[3]) * cospi_16_64; + temp2 = (step1[5] + step1[2]) * cospi_16_64; + step2[4] = fdct_round_shift(temp1); + step2[5] = fdct_round_shift(temp2); + // step 3 + step3[0] = step1[0] + step2[3]; + step3[1] = step1[1] + step2[2]; + step3[2] = step1[1] - step2[2]; + step3[3] = step1[0] - step2[3]; + step3[4] = step1[7] - step2[4]; + step3[5] = step1[6] - step2[5]; + step3[6] = step1[6] + step2[5]; + step3[7] = step1[7] + step2[4]; + // step 4 + temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; + temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; + step2[1] = fdct_round_shift(temp1); + step2[2] = fdct_round_shift(temp2); + temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64; + temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; + step2[5] = fdct_round_shift(temp1); + step2[6] = fdct_round_shift(temp2); + // step 5 + step1[0] = step3[0] + step2[1]; + step1[1] = step3[0] - step2[1]; + step1[2] = step3[3] + step2[2]; + step1[3] = step3[3] - 
step2[2]; + step1[4] = step3[4] - step2[5]; + step1[5] = step3[4] + step2[5]; + step1[6] = step3[7] - step2[6]; + step1[7] = step3[7] + step2[6]; + // step 6 + temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; + temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; + out[1] = (tran_low_t)fdct_round_shift(temp1); + out[9] = (tran_low_t)fdct_round_shift(temp2); + temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; + temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; + out[5] = (tran_low_t)fdct_round_shift(temp1); + out[13] = (tran_low_t)fdct_round_shift(temp2); + temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; + temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; + out[3] = (tran_low_t)fdct_round_shift(temp1); + out[11] = (tran_low_t)fdct_round_shift(temp2); + temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; + temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; + out[7] = (tran_low_t)fdct_round_shift(temp1); + out[15] = (tran_low_t)fdct_round_shift(temp2); + } + // Do next column (which is a transposed row in second/horizontal pass) + input++; + out += 16; + } + // Setup in/out for next pass. + in_low = intermediate; + out = output; + } +} + +void aom_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) { + int r, c; + int sum = 0; + for (r = 0; r < 16; ++r) + for (c = 0; c < 16; ++c) sum += input[r * stride + c]; + + output[0] = (tran_low_t)(sum >> 1); +} + +static INLINE tran_high_t dct_32_round(tran_high_t input) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + // TODO(debargha, peter.derivaz): Find new bounds for this assert, + // and make the bounds consts. 
+ // assert(-131072 <= rv && rv <= 131071); + return rv; +} + +static INLINE tran_high_t half_round_shift(tran_high_t input) { + tran_high_t rv = (input + 1 + (input < 0)) >> 2; + return rv; +} + +void aom_fdct32(const tran_high_t *input, tran_high_t *output, int round) { + tran_high_t step[32]; + // Stage 1 + step[0] = input[0] + input[(32 - 1)]; + step[1] = input[1] + input[(32 - 2)]; + step[2] = input[2] + input[(32 - 3)]; + step[3] = input[3] + input[(32 - 4)]; + step[4] = input[4] + input[(32 - 5)]; + step[5] = input[5] + input[(32 - 6)]; + step[6] = input[6] + input[(32 - 7)]; + step[7] = input[7] + input[(32 - 8)]; + step[8] = input[8] + input[(32 - 9)]; + step[9] = input[9] + input[(32 - 10)]; + step[10] = input[10] + input[(32 - 11)]; + step[11] = input[11] + input[(32 - 12)]; + step[12] = input[12] + input[(32 - 13)]; + step[13] = input[13] + input[(32 - 14)]; + step[14] = input[14] + input[(32 - 15)]; + step[15] = input[15] + input[(32 - 16)]; + step[16] = -input[16] + input[(32 - 17)]; + step[17] = -input[17] + input[(32 - 18)]; + step[18] = -input[18] + input[(32 - 19)]; + step[19] = -input[19] + input[(32 - 20)]; + step[20] = -input[20] + input[(32 - 21)]; + step[21] = -input[21] + input[(32 - 22)]; + step[22] = -input[22] + input[(32 - 23)]; + step[23] = -input[23] + input[(32 - 24)]; + step[24] = -input[24] + input[(32 - 25)]; + step[25] = -input[25] + input[(32 - 26)]; + step[26] = -input[26] + input[(32 - 27)]; + step[27] = -input[27] + input[(32 - 28)]; + step[28] = -input[28] + input[(32 - 29)]; + step[29] = -input[29] + input[(32 - 30)]; + step[30] = -input[30] + input[(32 - 31)]; + step[31] = -input[31] + input[(32 - 32)]; + + // Stage 2 + output[0] = step[0] + step[16 - 1]; + output[1] = step[1] + step[16 - 2]; + output[2] = step[2] + step[16 - 3]; + output[3] = step[3] + step[16 - 4]; + output[4] = step[4] + step[16 - 5]; + output[5] = step[5] + step[16 - 6]; + output[6] = step[6] + step[16 - 7]; + output[7] = step[7] + step[16 - 8]; + 
output[8] = -step[8] + step[16 - 9]; + output[9] = -step[9] + step[16 - 10]; + output[10] = -step[10] + step[16 - 11]; + output[11] = -step[11] + step[16 - 12]; + output[12] = -step[12] + step[16 - 13]; + output[13] = -step[13] + step[16 - 14]; + output[14] = -step[14] + step[16 - 15]; + output[15] = -step[15] + step[16 - 16]; + + output[16] = step[16]; + output[17] = step[17]; + output[18] = step[18]; + output[19] = step[19]; + + output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64); + output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64); + output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64); + output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64); + + output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64); + output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64); + output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64); + output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64); + + output[28] = step[28]; + output[29] = step[29]; + output[30] = step[30]; + output[31] = step[31]; + + // dump the magnitude by 4, hence the intermediate values are within + // the range of 16 bits. 
+ if (round) { + output[0] = half_round_shift(output[0]); + output[1] = half_round_shift(output[1]); + output[2] = half_round_shift(output[2]); + output[3] = half_round_shift(output[3]); + output[4] = half_round_shift(output[4]); + output[5] = half_round_shift(output[5]); + output[6] = half_round_shift(output[6]); + output[7] = half_round_shift(output[7]); + output[8] = half_round_shift(output[8]); + output[9] = half_round_shift(output[9]); + output[10] = half_round_shift(output[10]); + output[11] = half_round_shift(output[11]); + output[12] = half_round_shift(output[12]); + output[13] = half_round_shift(output[13]); + output[14] = half_round_shift(output[14]); + output[15] = half_round_shift(output[15]); + + output[16] = half_round_shift(output[16]); + output[17] = half_round_shift(output[17]); + output[18] = half_round_shift(output[18]); + output[19] = half_round_shift(output[19]); + output[20] = half_round_shift(output[20]); + output[21] = half_round_shift(output[21]); + output[22] = half_round_shift(output[22]); + output[23] = half_round_shift(output[23]); + output[24] = half_round_shift(output[24]); + output[25] = half_round_shift(output[25]); + output[26] = half_round_shift(output[26]); + output[27] = half_round_shift(output[27]); + output[28] = half_round_shift(output[28]); + output[29] = half_round_shift(output[29]); + output[30] = half_round_shift(output[30]); + output[31] = half_round_shift(output[31]); + } + + // Stage 3 + step[0] = output[0] + output[(8 - 1)]; + step[1] = output[1] + output[(8 - 2)]; + step[2] = output[2] + output[(8 - 3)]; + step[3] = output[3] + output[(8 - 4)]; + step[4] = -output[4] + output[(8 - 5)]; + step[5] = -output[5] + output[(8 - 6)]; + step[6] = -output[6] + output[(8 - 7)]; + step[7] = -output[7] + output[(8 - 8)]; + step[8] = output[8]; + step[9] = output[9]; + step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64); + step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64); + step[12] = 
dct_32_round((output[12] + output[11]) * cospi_16_64); + step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64); + step[14] = output[14]; + step[15] = output[15]; + + step[16] = output[16] + output[23]; + step[17] = output[17] + output[22]; + step[18] = output[18] + output[21]; + step[19] = output[19] + output[20]; + step[20] = -output[20] + output[19]; + step[21] = -output[21] + output[18]; + step[22] = -output[22] + output[17]; + step[23] = -output[23] + output[16]; + step[24] = -output[24] + output[31]; + step[25] = -output[25] + output[30]; + step[26] = -output[26] + output[29]; + step[27] = -output[27] + output[28]; + step[28] = output[28] + output[27]; + step[29] = output[29] + output[26]; + step[30] = output[30] + output[25]; + step[31] = output[31] + output[24]; + + // Stage 4 + output[0] = step[0] + step[3]; + output[1] = step[1] + step[2]; + output[2] = -step[2] + step[1]; + output[3] = -step[3] + step[0]; + output[4] = step[4]; + output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64); + output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64); + output[7] = step[7]; + output[8] = step[8] + step[11]; + output[9] = step[9] + step[10]; + output[10] = -step[10] + step[9]; + output[11] = -step[11] + step[8]; + output[12] = -step[12] + step[15]; + output[13] = -step[13] + step[14]; + output[14] = step[14] + step[13]; + output[15] = step[15] + step[12]; + + output[16] = step[16]; + output[17] = step[17]; + output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64); + output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64); + output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64); + output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64); + output[22] = step[22]; + output[23] = step[23]; + output[24] = step[24]; + output[25] = step[25]; + output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64); + output[27] = dct_32_round(step[27] * cospi_24_64 + 
step[20] * -cospi_8_64); + output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64); + output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64); + output[30] = step[30]; + output[31] = step[31]; + + // Stage 5 + step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64); + step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64); + step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64); + step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64); + step[4] = output[4] + output[5]; + step[5] = -output[5] + output[4]; + step[6] = -output[6] + output[7]; + step[7] = output[7] + output[6]; + step[8] = output[8]; + step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64); + step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64); + step[11] = output[11]; + step[12] = output[12]; + step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64); + step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64); + step[15] = output[15]; + + step[16] = output[16] + output[19]; + step[17] = output[17] + output[18]; + step[18] = -output[18] + output[17]; + step[19] = -output[19] + output[16]; + step[20] = -output[20] + output[23]; + step[21] = -output[21] + output[22]; + step[22] = output[22] + output[21]; + step[23] = output[23] + output[20]; + step[24] = output[24] + output[27]; + step[25] = output[25] + output[26]; + step[26] = -output[26] + output[25]; + step[27] = -output[27] + output[24]; + step[28] = -output[28] + output[31]; + step[29] = -output[29] + output[30]; + step[30] = output[30] + output[29]; + step[31] = output[31] + output[28]; + + // Stage 6 + output[0] = step[0]; + output[1] = step[1]; + output[2] = step[2]; + output[3] = step[3]; + output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64); + output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64); + output[6] = dct_32_round(step[6] * 
cospi_12_64 + step[5] * -cospi_20_64); + output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64); + output[8] = step[8] + step[9]; + output[9] = -step[9] + step[8]; + output[10] = -step[10] + step[11]; + output[11] = step[11] + step[10]; + output[12] = step[12] + step[13]; + output[13] = -step[13] + step[12]; + output[14] = -step[14] + step[15]; + output[15] = step[15] + step[14]; + + output[16] = step[16]; + output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64); + output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64); + output[19] = step[19]; + output[20] = step[20]; + output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64); + output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64); + output[23] = step[23]; + output[24] = step[24]; + output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64); + output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64); + output[27] = step[27]; + output[28] = step[28]; + output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64); + output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64); + output[31] = step[31]; + + // Stage 7 + step[0] = output[0]; + step[1] = output[1]; + step[2] = output[2]; + step[3] = output[3]; + step[4] = output[4]; + step[5] = output[5]; + step[6] = output[6]; + step[7] = output[7]; + step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64); + step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64); + step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64); + step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64); + step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64); + step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64); + step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64); + step[15] = 
dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64); + + step[16] = output[16] + output[17]; + step[17] = -output[17] + output[16]; + step[18] = -output[18] + output[19]; + step[19] = output[19] + output[18]; + step[20] = output[20] + output[21]; + step[21] = -output[21] + output[20]; + step[22] = -output[22] + output[23]; + step[23] = output[23] + output[22]; + step[24] = output[24] + output[25]; + step[25] = -output[25] + output[24]; + step[26] = -output[26] + output[27]; + step[27] = output[27] + output[26]; + step[28] = output[28] + output[29]; + step[29] = -output[29] + output[28]; + step[30] = -output[30] + output[31]; + step[31] = output[31] + output[30]; + + // Final stage --- outputs indices are bit-reversed. + output[0] = step[0]; + output[16] = step[1]; + output[8] = step[2]; + output[24] = step[3]; + output[4] = step[4]; + output[20] = step[5]; + output[12] = step[6]; + output[28] = step[7]; + output[2] = step[8]; + output[18] = step[9]; + output[10] = step[10]; + output[26] = step[11]; + output[6] = step[12]; + output[22] = step[13]; + output[14] = step[14]; + output[30] = step[15]; + + output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64); + output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64); + output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64); + output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64); + output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64); + output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64); + output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64); + output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64); + output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); + output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); + output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); + output[27] = 
dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); + output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); + output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); + output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); + output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); +} + +void aom_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { + int i, j; + tran_high_t output[32 * 32]; + + // Columns + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; + aom_fdct32(temp_in, temp_out, 0); + for (j = 0; j < 32; ++j) + output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + } + + // Rows + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; + aom_fdct32(temp_in, temp_out, 0); + for (j = 0; j < 32; ++j) + out[j + i * 32] = + (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); + } +} + +// Note that although we use dct_32_round in dct32 computation flow, +// this 2d fdct32x32 for rate-distortion optimization loop is operating +// within 16 bits precision. 
+void aom_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { + int i, j; + tran_high_t output[32 * 32]; + + // Columns + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; + aom_fdct32(temp_in, temp_out, 0); + for (j = 0; j < 32; ++j) + // TODO(cd): see quality impact of only doing + // output[j * 32 + i] = (temp_out[j] + 1) >> 2; + // PS: also change code in aom_dsp/x86/aom_dct_sse2.c + output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + } + + // Rows + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; + aom_fdct32(temp_in, temp_out, 1); + for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j]; + } +} + +void aom_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { + int r, c; + int sum = 0; + for (r = 0; r < 32; ++r) + for (c = 0; c < 32; ++c) sum += input[r * stride + c]; + + output[0] = (tran_low_t)(sum >> 3); +} + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, + int stride) { + aom_fdct4x4_c(input, output, stride); +} + +void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output, + int stride) { + aom_fdct8x8_c(input, final_output, stride); +} + +void aom_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output, + int stride) { + aom_fdct8x8_1_c(input, final_output, stride); +} + +void aom_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, + int stride) { + aom_fdct16x16_c(input, output, stride); +} + +void aom_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, + int stride) { + aom_fdct16x16_1_c(input, output, stride); +} + +void aom_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { + aom_fdct32x32_c(input, out, stride); +} + +void aom_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, + int stride) { + 
aom_fdct32x32_rd_c(input, out, stride); +} + +void aom_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out, + int stride) { + aom_fdct32x32_1_c(input, out, stride); +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/fwd_txfm.h b/third_party/aom/aom_dsp/fwd_txfm.h new file mode 100644 index 000000000..579dbd06e --- /dev/null +++ b/third_party/aom/aom_dsp/fwd_txfm.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_FWD_TXFM_H_ +#define AOM_DSP_FWD_TXFM_H_ + +#include "aom_dsp/txfm_common.h" + +static INLINE tran_high_t saturate_int16(tran_high_t value) { + tran_high_t result; + result = value > INT16_MAX ? INT16_MAX : value; + return result < INT16_MIN ? INT16_MIN : result; +} + +static INLINE tran_high_t fdct_round_shift(tran_high_t input) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + return rv; +} + +void aom_fdct32(const tran_high_t *input, tran_high_t *output, int round); +#endif // AOM_DSP_FWD_TXFM_H_ diff --git a/third_party/aom/aom_dsp/intrapred.c b/third_party/aom/aom_dsp/intrapred.c new file mode 100644 index 000000000..1f0870b64 --- /dev/null +++ b/third_party/aom/aom_dsp/intrapred.c @@ -0,0 +1,971 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/bitops.h" + +#define DST(x, y) dst[(x) + (y)*stride] +#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) +#define AVG2(a, b) (((a) + (b) + 1) >> 1) + +static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + (void)above; + + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], + left[(c >> 1) + r + 2]) + : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); + } + dst += stride; + } +} + +static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + (void)left; + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = r & 1 ? 
AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], + above[(r >> 1) + c + 2]) + : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); + } + dst += stride; + } +} + +static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + (void)left; + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = AVG3(above[r + c], above[r + c + 1], + above[r + c + 1 + (r + c + 2 < bs * 2)]); + } + dst += stride; + } +} + +static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + + // first row + for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]); + dst += stride; + + // second row + dst[0] = AVG3(left[0], above[-1], above[0]); + for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + dst += stride; + + // the rest of first col + dst[0] = AVG3(above[-1], left[0], left[1]); + for (r = 3; r < bs; ++r) + dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]); + + // the rest of the block + for (r = 2; r < bs; ++r) { + for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1]; + dst += stride; + } +} + +static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int i; +#if CONFIG_TX64X64 +#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7 + // silence a spurious -Warray-bounds warning, possibly related to: + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273 + uint8_t border[133]; +#else + uint8_t border[64 + 64 - 1]; // outer border from bottom-left to top-right +#endif +#else +#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7 + // silence a spurious -Warray-bounds warning, possibly related to: + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273 + uint8_t border[69]; +#else + uint8_t border[32 + 32 - 1]; // outer border from bottom-left to top-right +#endif +#endif // CONFIG_TX64X64 + + 
// dst(bs, bs - 2)[0], i.e., border starting at bottom-left + for (i = 0; i < bs - 2; ++i) { + border[i] = AVG3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]); + } + border[bs - 2] = AVG3(above[-1], left[0], left[1]); + border[bs - 1] = AVG3(left[0], above[-1], above[0]); + border[bs - 0] = AVG3(above[-1], above[0], above[1]); + // dst[0][2, size), i.e., remaining top border ascending + for (i = 0; i < bs - 2; ++i) { + border[bs + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]); + } + + for (i = 0; i < bs; ++i) { + memcpy(dst + i * stride, border + bs - 1 - i, bs); + } +} + +static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + dst[0] = AVG2(above[-1], left[0]); + for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]); + dst++; + + dst[0] = AVG3(left[0], above[-1], above[0]); + dst[stride] = AVG3(above[-1], left[0], left[1]); + for (r = 2; r < bs; r++) + dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); + dst++; + + for (c = 0; c < bs - 2; c++) + dst[c] = AVG3(above[c - 1], above[c], above[c + 1]); + dst += stride; + + for (r = 1; r < bs; ++r) { + for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2]; + dst += stride; + } +} + +static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r; + (void)left; + + for (r = 0; r < bs; r++) { + memcpy(dst, above, bs); + dst += stride; + } +} + +static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r; + (void)above; + + for (r = 0; r < bs; r++) { + memset(dst, left[r], bs); + dst += stride; + } +} + +#if CONFIG_ALT_INTRA +static INLINE int abs_diff(int a, int b) { return (a > b) ? 
a - b : b - a; } + +static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top, + uint16_t top_left) { + const int base = top + left - top_left; + const int p_left = abs_diff(base, left); + const int p_top = abs_diff(base, top); + const int p_top_left = abs_diff(base, top_left); + + // Return nearest to base of left, top and top_left. + return (p_left <= p_top && p_left <= p_top_left) + ? left + : (p_top <= p_top_left) ? top : top_left; +} + +static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + const uint8_t ytop_left = above[-1]; + + for (r = 0; r < bs; r++) { + for (c = 0; c < bs; c++) + dst[c] = (uint8_t)paeth_predictor_single(left[r], above[c], ytop_left); + dst += stride; + } +} + +// Weights are quadratic from '1' to '1 / block_size', scaled by +// 2^sm_weight_log2_scale. +static const int sm_weight_log2_scale = 8; + +#if CONFIG_TX64X64 +// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST]) +#define MAX_BLOCK_DIM 64 +#define NUM_BLOCK_DIMS 6 // log2(MAX_BLOCK_DIM) +#else +#define MAX_BLOCK_DIM 32 +#define NUM_BLOCK_DIMS 5 +#endif // CONFIG_TX64X64 + +static const uint8_t sm_weight_arrays[NUM_BLOCK_DIMS][MAX_BLOCK_DIM] = { + // bs = 2 + { 255, 128 }, + // bs = 4 + { 255, 149, 85, 64 }, + // bs = 8 + { 255, 197, 146, 105, 73, 50, 37, 32 }, + // bs = 16 + { 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16 }, + // bs = 32 + { + 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, + 111, 101, 92, 83, 74, 66, 59, 52, 45, 39, 34, + 29, 25, 21, 17, 14, 12, 10, 9, 8, 8 }, +#if CONFIG_TX64X64 + // bs = 64 + { 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, + 163, 156, 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, + 91, 86, 82, 77, 73, 69, 65, 61, 57, 54, 50, 47, 44, + 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15, 13, + 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4 }, +#endif // CONFIG_TX64X64 +}; + +// Some basic 
checks on weights for smooth predictor. +#define sm_weights_sanity_checks(weights, weights_scale, pred_scale) \ + assert(weights[0] < weights_scale); \ + assert(weights_scale - weights[bs - 1] < weights_scale); \ + assert(pred_scale < 31) // ensures no overflow when calculating predictor. + +#define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits)) + +static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + const uint8_t below_pred = left[bs - 1]; // estimated by bottom-left pixel + const uint8_t right_pred = above[bs - 1]; // estimated by top-right pixel + const int arr_index = get_msb(bs) - 1; + assert(arr_index >= 0); + assert(arr_index < NUM_BLOCK_DIMS); + const uint8_t *const sm_weights = sm_weight_arrays[arr_index]; + // scale = 2 * 2^sm_weight_log2_scale + const int log2_scale = 1 + sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights, scale, log2_scale + sizeof(*dst)); + int r; + for (r = 0; r < bs; ++r) { + int c; + for (c = 0; c < bs; ++c) { + const uint8_t pixels[] = { above[c], below_pred, left[r], right_pred }; + const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r], + sm_weights[c], scale - sm_weights[c] }; + uint32_t this_pred = 0; + int i; + assert(scale >= sm_weights[r] && scale >= sm_weights[c]); + for (i = 0; i < 4; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = clip_pixel(divide_round(this_pred, log2_scale)); + } + dst += stride; + } +} + +#else + +static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + int ytop_left = above[-1]; + + for (r = 0; r < bs; r++) { + for (c = 0; c < bs; c++) + dst[c] = clip_pixel(left[r] + above[c] - ytop_left); + dst += stride; + } +} +#endif // CONFIG_ALT_INTRA + +static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const 
uint8_t *left) { + int r; + (void)above; + (void)left; + + for (r = 0; r < bs; r++) { + memset(dst, 128, bs); + dst += stride; + } +} + +static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + int i, r, expected_dc, sum = 0; + (void)above; + + for (i = 0; i < bs; i++) sum += left[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + memset(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int i, r, expected_dc, sum = 0; + (void)left; + + for (i = 0; i < bs; i++) sum += above[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + memset(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int i, r, expected_dc, sum = 0; + const int count = 2 * bs; + + for (i = 0; i < bs; i++) { + sum += above[i]; + sum += left[i]; + } + + expected_dc = (sum + (count >> 1)) / count; + + for (r = 0; r < bs; r++) { + memset(dst, expected_dc, bs); + dst += stride; + } +} + +void aom_d45e_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + (void)stride; + (void)left; + + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(1, 1) = AVG3(C, D, D); +} + +void aom_d117_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + DST(0, 0) = AVG2(X, A); + DST(1, 0) = AVG2(A, B); + DST(0, 1) = AVG3(I, X, A); + DST(1, 1) = AVG3(X, A, B); +} + +void aom_d135_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const 
uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + (void)stride; + DST(0, 1) = AVG3(X, I, J); + DST(1, 1) = DST(0, 0) = AVG3(A, X, I); + DST(1, 0) = AVG3(B, A, X); +} + +void aom_d153_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int X = above[-1]; + const int A = above[0]; + + DST(0, 0) = AVG2(I, X); + DST(0, 1) = AVG2(J, I); + DST(1, 0) = AVG3(I, X, A); + DST(1, 1) = AVG3(J, I, X); +} + +void aom_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + const int H = above[7]; + (void)stride; + (void)left; + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); + DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); + DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); + DST(3, 2) = DST(2, 3) = AVG3(F, G, H); + DST(3, 3) = AVG3(G, H, H); +} + +void aom_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + DST(0, 0) = DST(1, 2) = AVG2(X, A); + DST(1, 0) = DST(2, 2) = AVG2(A, B); + DST(2, 0) = DST(3, 2) = AVG2(B, C); + DST(3, 0) = AVG2(C, D); + + DST(0, 3) = AVG3(K, J, I); + DST(0, 2) = AVG3(J, I, X); + DST(0, 1) = DST(1, 3) = AVG3(I, X, A); + DST(1, 1) = DST(2, 3) = AVG3(X, A, B); + DST(2, 1) = DST(3, 3) = AVG3(A, B, C); + DST(3, 1) = AVG3(B, C, D); +} + +void aom_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + 
const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + (void)stride; + DST(0, 3) = AVG3(J, K, L); + DST(1, 3) = DST(0, 2) = AVG3(I, J, K); + DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J); + DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I); + DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X); + DST(3, 1) = DST(2, 0) = AVG3(C, B, A); + DST(3, 0) = AVG3(D, C, B); +} + +void aom_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + + DST(0, 0) = DST(2, 1) = AVG2(I, X); + DST(0, 1) = DST(2, 2) = AVG2(J, I); + DST(0, 2) = DST(2, 3) = AVG2(K, J); + DST(0, 3) = AVG2(L, K); + + DST(3, 0) = AVG3(A, B, C); + DST(2, 0) = AVG3(X, A, B); + DST(1, 0) = DST(3, 1) = AVG3(I, X, A); + DST(1, 1) = DST(3, 2) = AVG3(J, I, X); + DST(1, 2) = DST(3, 3) = AVG3(K, J, I); + DST(1, 3) = AVG3(L, K, J); +} + +#if CONFIG_HIGHBITDEPTH +static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void)above; + (void)bd; + + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], + left[(c >> 1) + r + 2]) + : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); + } + dst += stride; + } +} + +static INLINE void highbd_d63e_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void)left; + (void)bd; + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = r & 1 ? 
AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], + above[(r >> 1) + c + 2]) + : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); + } + dst += stride; + } +} + +static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void)left; + (void)bd; + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = AVG3(above[r + c], above[r + c + 1], + above[r + c + 1 + (r + c + 2 < bs * 2)]); + } + dst += stride; + } +} + +static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void)bd; + + // first row + for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]); + dst += stride; + + // second row + dst[0] = AVG3(left[0], above[-1], above[0]); + for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + dst += stride; + + // the rest of first col + dst[0] = AVG3(above[-1], left[0], left[1]); + for (r = 3; r < bs; ++r) + dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]); + + // the rest of the block + for (r = 2; r < bs; ++r) { + for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1]; + dst += stride; + } +} + +static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void)bd; + dst[0] = AVG3(left[0], above[-1], above[0]); + for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + + dst[stride] = AVG3(above[-1], left[0], left[1]); + for (r = 2; r < bs; ++r) + dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); + + dst += stride; + for (r = 1; r < bs; ++r) { + for (c = 1; c < bs; c++) dst[c] = dst[-stride + c - 1]; + dst += stride; + } +} + +static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + 
(void)bd; + dst[0] = AVG2(above[-1], left[0]); + for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]); + dst++; + + dst[0] = AVG3(left[0], above[-1], above[0]); + dst[stride] = AVG3(above[-1], left[0], left[1]); + for (r = 2; r < bs; r++) + dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); + dst++; + + for (c = 0; c < bs - 2; c++) + dst[c] = AVG3(above[c - 1], above[c], above[c + 1]); + dst += stride; + + for (r = 1; r < bs; ++r) { + for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2]; + dst += stride; + } +} + +static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void)left; + (void)bd; + for (r = 0; r < bs; r++) { + memcpy(dst, above, bs * sizeof(uint16_t)); + dst += stride; + } +} + +static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void)above; + (void)bd; + for (r = 0; r < bs; r++) { + aom_memset16(dst, left[r], bs); + dst += stride; + } +} + +void aom_highbd_d207_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + (void)above; + (void)bd; + DST(0, 0) = AVG2(I, J); + DST(0, 1) = AVG2(J, K); + DST(1, 0) = AVG3(I, J, K); + DST(1, 1) = AVG3(J, K, L); +} + +void aom_highbd_d63_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + (void)left; + (void)bd; + DST(0, 0) = AVG2(A, B); + DST(1, 0) = AVG2(B, C); + DST(0, 1) = AVG3(A, B, C); + DST(1, 1) = AVG3(B, C, D); +} + +void aom_highbd_d45e_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int A = above[0]; + const 
int B = above[1]; + const int C = above[2]; + const int D = above[3]; + (void)stride; + (void)left; + (void)bd; + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(1, 1) = AVG3(C, D, D); +} + +void aom_highbd_d117_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + (void)bd; + DST(0, 0) = AVG2(X, A); + DST(1, 0) = AVG2(A, B); + DST(0, 1) = AVG3(I, X, A); + DST(1, 1) = AVG3(X, A, B); +} + +void aom_highbd_d135_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + (void)bd; + DST(0, 1) = AVG3(X, I, J); + DST(1, 1) = DST(0, 0) = AVG3(A, X, I); + DST(1, 0) = AVG3(B, A, X); +} + +void aom_highbd_d153_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int X = above[-1]; + const int A = above[0]; + (void)bd; + DST(0, 0) = AVG2(I, X); + DST(0, 1) = AVG2(J, I); + DST(1, 0) = AVG3(I, X, A); + DST(1, 1) = AVG3(J, I, X); +} + +#if CONFIG_ALT_INTRA +static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + const uint16_t ytop_left = above[-1]; + (void)bd; + + for (r = 0; r < bs; r++) { + for (c = 0; c < bs; c++) + dst[c] = paeth_predictor_single(left[r], above[c], ytop_left); + dst += stride; + } +} + +static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + const uint16_t below_pred = left[bs - 1]; // estimated by bottom-left pixel + const uint16_t right_pred = above[bs - 1]; // estimated by top-right pixel + const 
int arr_index = get_msb(bs) - 1; + assert(arr_index >= 0); + assert(arr_index < NUM_BLOCK_DIMS); + const uint8_t *const sm_weights = sm_weight_arrays[arr_index]; + // scale = 2 * 2^sm_weight_log2_scale + const int log2_scale = 1 + sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights, scale, log2_scale + sizeof(*dst)); + int r; + for (r = 0; r < bs; ++r) { + int c; + for (c = 0; c < bs; ++c) { + const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred }; + const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r], + sm_weights[c], scale - sm_weights[c] }; + uint32_t this_pred = 0; + int i; + assert(scale >= sm_weights[r] && scale >= sm_weights[c]); + for (i = 0; i < 4; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = clip_pixel_highbd(divide_round(this_pred, log2_scale), bd); + } + dst += stride; + } +} + +#else +static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + int ytop_left = above[-1]; + (void)bd; + + for (r = 0; r < bs; r++) { + for (c = 0; c < bs; c++) + dst[c] = clip_pixel_highbd(left[r] + above[c] - ytop_left, bd); + dst += stride; + } +} +#endif // CONFIG_ALT_INTRA + +static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void)above; + (void)left; + + for (r = 0; r < bs; r++) { + aom_memset16(dst, 128 << (bd - 8), bs); + dst += stride; + } +} + +static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + (void)above; + (void)bd; + + for (i = 0; i < bs; i++) sum += left[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + aom_memset16(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void 
highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + (void)left; + (void)bd; + + for (i = 0; i < bs; i++) sum += above[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + aom_memset16(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + const int count = 2 * bs; + (void)bd; + + for (i = 0; i < bs; i++) { + sum += above[i]; + sum += left[i]; + } + + expected_dc = (sum + (count >> 1)) / count; + + for (r = 0; r < bs; r++) { + aom_memset16(dst, expected_dc, bs); + dst += stride; + } +} +#endif // CONFIG_HIGHBITDEPTH + +// This serves as a wrapper function, so that all the prediction functions +// can be unified and accessed as a pointer array. Note that the boundary +// above and left are not necessarily used all the time. 
+#define intra_pred_sized(type, size) \ + void aom_##type##_predictor_##size##x##size##_c( \ + uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \ + const uint8_t *left) { \ + type##_predictor(dst, stride, size, above, left); \ + } + +#if CONFIG_HIGHBITDEPTH +#define intra_pred_highbd_sized(type, size) \ + void aom_highbd_##type##_predictor_##size##x##size##_c( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + highbd_##type##_predictor(dst, stride, size, above, left, bd); \ + } + +/* clang-format off */ +#if CONFIG_TX64X64 +#define intra_pred_allsizes(type) \ + intra_pred_sized(type, 2) \ + intra_pred_sized(type, 4) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) \ + intra_pred_sized(type, 64) \ + intra_pred_highbd_sized(type, 4) \ + intra_pred_highbd_sized(type, 8) \ + intra_pred_highbd_sized(type, 16) \ + intra_pred_highbd_sized(type, 32) \ + intra_pred_highbd_sized(type, 64) + +#define intra_pred_above_4x4(type) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) \ + intra_pred_sized(type, 64) \ + intra_pred_highbd_sized(type, 4) \ + intra_pred_highbd_sized(type, 8) \ + intra_pred_highbd_sized(type, 16) \ + intra_pred_highbd_sized(type, 32) \ + intra_pred_highbd_sized(type, 64) +#else // CONFIG_TX64X64 +#define intra_pred_allsizes(type) \ + intra_pred_sized(type, 2) \ + intra_pred_sized(type, 4) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) \ + intra_pred_highbd_sized(type, 2) \ + intra_pred_highbd_sized(type, 4) \ + intra_pred_highbd_sized(type, 8) \ + intra_pred_highbd_sized(type, 16) \ + intra_pred_highbd_sized(type, 32) + +#define intra_pred_above_4x4(type) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) \ + intra_pred_highbd_sized(type, 4) \ + intra_pred_highbd_sized(type, 8) \ + intra_pred_highbd_sized(type, 16) \ + 
intra_pred_highbd_sized(type, 32) +#endif // CONFIG_TX64X64 + +#else + +#if CONFIG_TX64X64 +#define intra_pred_allsizes(type) \ + intra_pred_sized(type, 2) \ + intra_pred_sized(type, 4) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) \ + intra_pred_sized(type, 64) + +#define intra_pred_above_4x4(type) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) \ + intra_pred_sized(type, 64) +#else // CONFIG_TX64X64 +#define intra_pred_allsizes(type) \ + intra_pred_sized(type, 2) \ + intra_pred_sized(type, 4) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) + +#define intra_pred_above_4x4(type) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) +#endif // CONFIG_TX64X64 +#endif // CONFIG_HIGHBITDEPTH + +intra_pred_allsizes(d207e) +intra_pred_allsizes(d63e) +intra_pred_above_4x4(d45e) +intra_pred_above_4x4(d117) +intra_pred_above_4x4(d135) +intra_pred_above_4x4(d153) +intra_pred_allsizes(v) +intra_pred_allsizes(h) +#if CONFIG_ALT_INTRA +intra_pred_allsizes(paeth) +intra_pred_allsizes(smooth) +#else +intra_pred_allsizes(tm) +#endif // CONFIG_ALT_INTRA +intra_pred_allsizes(dc_128) +intra_pred_allsizes(dc_left) +intra_pred_allsizes(dc_top) +intra_pred_allsizes(dc) +/* clang-format on */ +#undef intra_pred_allsizes diff --git a/third_party/aom/aom_dsp/inv_txfm.c b/third_party/aom/aom_dsp/inv_txfm.c new file mode 100644 index 000000000..bb995856a --- /dev/null +++ b/third_party/aom/aom_dsp/inv_txfm.c @@ -0,0 +1,1445 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/inv_txfm.h" + +void aom_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. */ + int i; + tran_low_t output[16]; + tran_high_t a1, b1, c1, d1, e1; + const tran_low_t *ip = input; + tran_low_t *op = output; + + for (i = 0; i < 4; i++) { + a1 = ip[0] >> UNIT_QUANT_SHIFT; + c1 = ip[1] >> UNIT_QUANT_SHIFT; + d1 = ip[2] >> UNIT_QUANT_SHIFT; + b1 = ip[3] >> UNIT_QUANT_SHIFT; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + op[0] = WRAPLOW(a1); + op[1] = WRAPLOW(b1); + op[2] = WRAPLOW(c1); + op[3] = WRAPLOW(d1); + ip += 4; + op += 4; + } + + ip = output; + for (i = 0; i < 4; i++) { + a1 = ip[4 * 0]; + c1 = ip[4 * 1]; + d1 = ip[4 * 2]; + b1 = ip[4 * 3]; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1)); + dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1)); + dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1)); + dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1)); + + ip++; + dest++; + } +} + +void aom_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { + int i; + tran_high_t a1, e1; + tran_low_t tmp[4]; + const tran_low_t *ip = in; + tran_low_t *op = tmp; + + a1 = ip[0] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + op[0] = WRAPLOW(a1); + op[1] = op[2] = op[3] = WRAPLOW(e1); + + ip = tmp; + for (i = 0; i < 4; i++) { + e1 = ip[0] >> 1; + a1 = ip[0] - e1; + dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1); + dest[dest_stride * 1] = 
clip_pixel_add(dest[dest_stride * 1], e1); + dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1); + dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1); + ip++; + dest++; + } +} + +void aom_idct4_c(const tran_low_t *input, tran_low_t *output) { + tran_low_t step[4]; + tran_high_t temp1, temp2; + // stage 1 + temp1 = (input[0] + input[2]) * cospi_16_64; + temp2 = (input[0] - input[2]) * cospi_16_64; + step[0] = WRAPLOW(dct_const_round_shift(temp1)); + step[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step[2] = WRAPLOW(dct_const_round_shift(temp1)); + step[3] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 + output[0] = WRAPLOW(step[0] + step[3]); + output[1] = WRAPLOW(step[1] + step[2]); + output[2] = WRAPLOW(step[1] - step[2]); + output[3] = WRAPLOW(step[0] - step[3]); +} + +void aom_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[4], temp_out[4]; + + // Rows + for (i = 0; i < 4; ++i) { + aom_idct4_c(input, outptr); + input += 4; + outptr += 4; + } + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; + aom_idct4_c(temp_in, temp_out); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 4)); + } + } +} + +void aom_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, + int dest_stride) { + int i; + tran_high_t a1; + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 4); + + if (a1 == 0) return; + + for (i = 0; i < 4; i++) { + dest[0] = clip_pixel_add(dest[0], a1); + dest[1] = clip_pixel_add(dest[1], a1); + dest[2] = clip_pixel_add(dest[2], a1); + dest[3] = clip_pixel_add(dest[3], 
a1); + dest += dest_stride; + } +} + +void aom_idct8_c(const tran_low_t *input, tran_low_t *output) { + tran_low_t step1[8], step2[8]; + tran_high_t temp1, temp2; + // stage 1 + step1[0] = input[0]; + step1[2] = input[4]; + step1[1] = input[2]; + step1[3] = input[6]; + temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 + temp1 = (step1[0] + step1[2]) * cospi_16_64; + temp2 = (step1[0] - step1[2]) * cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); + + // stage 3 + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[7] = step2[7]; + + // stage 4 + output[0] = WRAPLOW(step1[0] + step1[7]); + output[1] = WRAPLOW(step1[1] + step1[6]); + output[2] = WRAPLOW(step1[2] + step1[5]); + output[3] = WRAPLOW(step1[3] + step1[4]); + output[4] = WRAPLOW(step1[3] - step1[4]); + output[5] = 
WRAPLOW(step1[2] - step1[5]); + output[6] = WRAPLOW(step1[1] - step1[6]); + output[7] = WRAPLOW(step1[0] - step1[7]); +} + +void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[8], temp_out[8]; + + // First transform rows + for (i = 0; i < 8; ++i) { + aom_idct8_c(input, outptr); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + aom_idct8_c(temp_in, temp_out); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 5)); + } + } +} + +void aom_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_high_t a1; + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 5); + if (a1 == 0) return; + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1); + dest += stride; + } +} + +void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + + tran_low_t x0 = input[0]; + tran_low_t x1 = input[1]; + tran_low_t x2 = input[2]; + tran_low_t x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = WRAPLOW(x0 - x2 + x3); + + s0 = s0 + s3 + s5; + s1 = s1 - s4 - s6; + s3 = s2; + s2 = sinpi_3_9 * s7; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. 
+ output[0] = WRAPLOW(dct_const_round_shift(s0 + s3)); + output[1] = WRAPLOW(dct_const_round_shift(s1 + s3)); + output[2] = WRAPLOW(dct_const_round_shift(s2)); + output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3)); +} + +void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + + tran_high_t x0 = input[7]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[5]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[3]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[1]; + tran_high_t x7 = input[6]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = output[7] = 0; + return; + } + + // stage 1 + s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1); + s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1); + s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3); + s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3); + s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5); + s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5); + s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7); + s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7); + + x0 = WRAPLOW(dct_const_round_shift(s0 + s4)); + x1 = WRAPLOW(dct_const_round_shift(s1 + s5)); + x2 = WRAPLOW(dct_const_round_shift(s2 + s6)); + x3 = WRAPLOW(dct_const_round_shift(s3 + s7)); + x4 = WRAPLOW(dct_const_round_shift(s0 - s4)); + x5 = WRAPLOW(dct_const_round_shift(s1 - s5)); + x6 = WRAPLOW(dct_const_round_shift(s2 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s3 - s7)); + + // stage 2 + s0 = (int)x0; + s1 = (int)x1; + s2 = (int)x2; + s3 = (int)x3; + s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5); + s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5); + s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7); + s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7); + + x0 = WRAPLOW(s0 + s2); + x1 = WRAPLOW(s1 + s3); + x2 = WRAPLOW(s0 - s2); + x3 = WRAPLOW(s1 - s3); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); + x5 = 
WRAPLOW(dct_const_round_shift(s5 + s7)); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); + + // stage 3 + s2 = (int)(cospi_16_64 * (x2 + x3)); + s3 = (int)(cospi_16_64 * (x2 - x3)); + s6 = (int)(cospi_16_64 * (x6 + x7)); + s7 = (int)(cospi_16_64 * (x6 - x7)); + + x2 = WRAPLOW(dct_const_round_shift(s2)); + x3 = WRAPLOW(dct_const_round_shift(s3)); + x6 = WRAPLOW(dct_const_round_shift(s6)); + x7 = WRAPLOW(dct_const_round_shift(s7)); + + output[0] = WRAPLOW(x0); + output[1] = WRAPLOW(-x4); + output[2] = WRAPLOW(x6); + output[3] = WRAPLOW(-x2); + output[4] = WRAPLOW(x3); + output[5] = WRAPLOW(-x7); + output[6] = WRAPLOW(x5); + output[7] = WRAPLOW(-x1); +} + +void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[8], temp_out[8]; + + // First transform rows + // only first 4 row has non-zero coefs + for (i = 0; i < 4; ++i) { + aom_idct8_c(input, outptr); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + aom_idct8_c(temp_in, temp_out); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 5)); + } + } +} + +void aom_idct16_c(const tran_low_t *input, tran_low_t *output) { + tran_low_t step1[16], step2[16]; + tran_high_t temp1, temp2; + + // stage 1 + step1[0] = input[0 / 2]; + step1[1] = input[16 / 2]; + step1[2] = input[8 / 2]; + step1[3] = input[24 / 2]; + step1[4] = input[4 / 2]; + step1[5] = input[20 / 2]; + step1[6] = input[12 / 2]; + step1[7] = input[28 / 2]; + step1[8] = input[2 / 2]; + step1[9] = input[18 / 2]; + step1[10] = input[10 / 2]; + step1[11] = input[26 / 2]; + step1[12] = input[6 / 2]; + step1[13] = input[22 / 2]; + step1[14] = input[14 / 2]; + step1[15] = input[30 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = 
step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = WRAPLOW(dct_const_round_shift(temp1)); + step2[15] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + step1[8] = WRAPLOW(step2[8] + step2[9]); + step1[9] = WRAPLOW(step2[8] - step2[9]); + step1[10] = WRAPLOW(-step2[10] + step2[11]); + step1[11] = WRAPLOW(step2[10] + step2[11]); + step1[12] = WRAPLOW(step2[12] + step2[13]); + step1[13] = WRAPLOW(step2[12] - step2[13]); + step1[14] = WRAPLOW(-step2[14] + step2[15]); + step1[15] = WRAPLOW(step2[14] + step2[15]); + + // stage 4 + temp1 = 
(step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[7] = step2[7]; + + step1[8] = WRAPLOW(step2[8] + step2[11]); + step1[9] = WRAPLOW(step2[9] + step2[10]); + step1[10] = WRAPLOW(step2[9] - step2[10]); + step1[11] = WRAPLOW(step2[8] - step2[11]); + step1[12] = WRAPLOW(-step2[12] + step2[15]); + step1[13] = WRAPLOW(-step2[13] + step2[14]); + step1[14] = WRAPLOW(step2[13] + step2[14]); + step1[15] = WRAPLOW(step2[12] + step2[15]); + + // stage 6 + step2[0] = WRAPLOW(step1[0] + step1[7]); + step2[1] 
= WRAPLOW(step1[1] + step1[6]); + step2[2] = WRAPLOW(step1[2] + step1[5]); + step2[3] = WRAPLOW(step1[3] + step1[4]); + step2[4] = WRAPLOW(step1[3] - step1[4]); + step2[5] = WRAPLOW(step1[2] - step1[5]); + step2[6] = WRAPLOW(step1[1] - step1[6]); + step2[7] = WRAPLOW(step1[0] - step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + output[0] = WRAPLOW(step2[0] + step2[15]); + output[1] = WRAPLOW(step2[1] + step2[14]); + output[2] = WRAPLOW(step2[2] + step2[13]); + output[3] = WRAPLOW(step2[3] + step2[12]); + output[4] = WRAPLOW(step2[4] + step2[11]); + output[5] = WRAPLOW(step2[5] + step2[10]); + output[6] = WRAPLOW(step2[6] + step2[9]); + output[7] = WRAPLOW(step2[7] + step2[8]); + output[8] = WRAPLOW(step2[7] - step2[8]); + output[9] = WRAPLOW(step2[6] - step2[9]); + output[10] = WRAPLOW(step2[5] - step2[10]); + output[11] = WRAPLOW(step2[4] - step2[11]); + output[12] = WRAPLOW(step2[3] - step2[12]); + output[13] = WRAPLOW(step2[2] - step2[13]); + output[14] = WRAPLOW(step2[1] - step2[14]); + output[15] = WRAPLOW(step2[0] - step2[15]); +} + +void aom_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows + for (i = 0; i < 16; ++i) { + aom_idct16_c(input, outptr); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + aom_idct16_c(temp_in, 
temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void aom_iadst16_c(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + + tran_high_t x0 = input[15]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[13]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[11]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[9]; + tran_high_t x7 = input[6]; + tran_high_t x8 = input[7]; + tran_high_t x9 = input[8]; + tran_high_t x10 = input[5]; + tran_high_t x11 = input[10]; + tran_high_t x12 = input[3]; + tran_high_t x13 = input[12]; + tran_high_t x14 = input[1]; + tran_high_t x15 = input[14]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = output[7] = output[8] = output[9] = output[10] = + output[11] = output[12] = output[13] = output[14] = output[15] = 0; + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); + x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); + x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); + x3 = 
WRAPLOW(dct_const_round_shift(s3 + s11)); + x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); + x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); + x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); + x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); + x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); + x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); + x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); + x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = WRAPLOW(s0 + s4); + x1 = WRAPLOW(s1 + s5); + x2 = WRAPLOW(s2 + s6); + x3 = WRAPLOW(s3 + s7); + x4 = WRAPLOW(s0 - s4); + x5 = WRAPLOW(s1 - s5); + x6 = WRAPLOW(s2 - s6); + x7 = WRAPLOW(s3 - s7); + x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); + x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); + x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); + x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); + x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = 
x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = WRAPLOW(s0 + s2); + x1 = WRAPLOW(s1 + s3); + x2 = WRAPLOW(s0 - s2); + x3 = WRAPLOW(s1 - s3); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); + x8 = WRAPLOW(s8 + s10); + x9 = WRAPLOW(s9 + s11); + x10 = WRAPLOW(s8 - s10); + x11 = WRAPLOW(s9 - s11); + x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); + x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); + x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); + + // stage 4 + s2 = (-cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (-cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = WRAPLOW(dct_const_round_shift(s2)); + x3 = WRAPLOW(dct_const_round_shift(s3)); + x6 = WRAPLOW(dct_const_round_shift(s6)); + x7 = WRAPLOW(dct_const_round_shift(s7)); + x10 = WRAPLOW(dct_const_round_shift(s10)); + x11 = WRAPLOW(dct_const_round_shift(s11)); + x14 = WRAPLOW(dct_const_round_shift(s14)); + x15 = WRAPLOW(dct_const_round_shift(s15)); + + output[0] = WRAPLOW(x0); + output[1] = WRAPLOW(-x8); + output[2] = WRAPLOW(x12); + output[3] = WRAPLOW(-x4); + output[4] = WRAPLOW(x6); + output[5] = WRAPLOW(x14); + output[6] = WRAPLOW(x10); + output[7] = WRAPLOW(x2); + output[8] = WRAPLOW(x3); + output[9] = WRAPLOW(x11); + output[10] = WRAPLOW(x15); + output[11] = WRAPLOW(x7); + output[12] = WRAPLOW(x5); + output[13] = WRAPLOW(-x13); + output[14] = WRAPLOW(x9); + output[15] = WRAPLOW(-x1); +} + +void aom_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + int i, j; + tran_low_t out[16 * 
16] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 8x8 area, we only need to calculate first 8 rows here. + for (i = 0; i < 8; ++i) { + aom_idct16_c(input, outptr); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + aom_idct16_c(temp_in, temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void aom_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. + for (i = 0; i < 4; ++i) { + aom_idct16_c(input, outptr); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + aom_idct16_c(temp_in, temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void aom_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_high_t a1; + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + if (a1 == 0) return; + for (j = 0; j < 16; ++j) { + for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1); + dest += stride; + } +} + +void aom_idct32_c(const tran_low_t *input, tran_low_t *output) { + tran_low_t step1[32], step2[32]; + tran_high_t temp1, temp2; + + // stage 1 + step1[0] = input[0]; + step1[1] = input[16]; + step1[2] = input[8]; + step1[3] = 
input[24]; + step1[4] = input[4]; + step1[5] = input[20]; + step1[6] = input[12]; + step1[7] = input[28]; + step1[8] = input[2]; + step1[9] = input[18]; + step1[10] = input[10]; + step1[11] = input[26]; + step1[12] = input[6]; + step1[13] = input[22]; + step1[14] = input[14]; + step1[15] = input[30]; + + temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; + temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; + step1[16] = WRAPLOW(dct_const_round_shift(temp1)); + step1[31] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; + temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; + step1[17] = WRAPLOW(dct_const_round_shift(temp1)); + step1[30] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; + temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; + temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; + step1[19] = WRAPLOW(dct_const_round_shift(temp1)); + step1[28] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; + temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; + temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; + temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; + temp2 = input[29] * cospi_29_64 + 
input[3] * cospi_3_64; + step1[23] = WRAPLOW(dct_const_round_shift(temp1)); + step1[24] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = WRAPLOW(dct_const_round_shift(temp1)); + step2[15] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + + step2[16] = WRAPLOW(step1[16] + step1[17]); + step2[17] = WRAPLOW(step1[16] - step1[17]); + step2[18] = WRAPLOW(-step1[18] + step1[19]); + step2[19] = WRAPLOW(step1[18] + step1[19]); + step2[20] = WRAPLOW(step1[20] + step1[21]); + step2[21] = WRAPLOW(step1[20] - step1[21]); + step2[22] = WRAPLOW(-step1[22] + step1[23]); + step2[23] = WRAPLOW(step1[22] + step1[23]); + step2[24] = WRAPLOW(step1[24] + step1[25]); + step2[25] = WRAPLOW(step1[24] - step1[25]); + step2[26] = WRAPLOW(-step1[26] + step1[27]); + step2[27] = WRAPLOW(step1[26] + step1[27]); + step2[28] = WRAPLOW(step1[28] + step1[29]); + step2[29] = WRAPLOW(step1[28] - step1[29]); + step2[30] = WRAPLOW(-step1[30] + step1[31]); + step2[31] = WRAPLOW(step1[30] + step1[31]); + + // stage 3 + 
step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + step1[8] = WRAPLOW(step2[8] + step2[9]); + step1[9] = WRAPLOW(step2[8] - step2[9]); + step1[10] = WRAPLOW(-step2[10] + step2[11]); + step1[11] = WRAPLOW(step2[10] + step2[11]); + step1[12] = WRAPLOW(step2[12] + step2[13]); + step1[13] = WRAPLOW(step2[12] - step2[13]); + step1[14] = WRAPLOW(-step2[14] + step2[15]); + step1[15] = WRAPLOW(step2[14] + step2[15]); + + step1[16] = step2[16]; + step1[31] = step2[31]; + temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; + temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; + step1[17] = WRAPLOW(dct_const_round_shift(temp1)); + step1[30] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; + temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); + step1[19] = step2[19]; + step1[20] = step2[20]; + temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; + temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; + temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + 
step1[28] = step2[28]; + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + step2[11] = step1[11]; + step2[12] = step1[12]; + + step2[16] = WRAPLOW(step1[16] + step1[19]); + step2[17] = WRAPLOW(step1[17] + step1[18]); + step2[18] = WRAPLOW(step1[17] - step1[18]); + step2[19] = WRAPLOW(step1[16] - step1[19]); + step2[20] = WRAPLOW(-step1[20] + step1[23]); + step2[21] = WRAPLOW(-step1[21] + step1[22]); + step2[22] = WRAPLOW(step1[21] + step1[22]); + step2[23] = WRAPLOW(step1[20] + step1[23]); + + step2[24] = WRAPLOW(step1[24] + step1[27]); + step2[25] = WRAPLOW(step1[25] + step1[26]); + step2[26] = WRAPLOW(step1[25] - step1[26]); + step2[27] = WRAPLOW(step1[24] - step1[27]); + step2[28] = WRAPLOW(-step1[28] + step1[31]); + step2[29] = WRAPLOW(-step1[29] + step1[30]); + step2[30] = WRAPLOW(step1[29] + step1[30]); + step2[31] = WRAPLOW(step1[28] + step1[31]); + + // stage 5 + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = 
WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[7] = step2[7]; + + step1[8] = WRAPLOW(step2[8] + step2[11]); + step1[9] = WRAPLOW(step2[9] + step2[10]); + step1[10] = WRAPLOW(step2[9] - step2[10]); + step1[11] = WRAPLOW(step2[8] - step2[11]); + step1[12] = WRAPLOW(-step2[12] + step2[15]); + step1[13] = WRAPLOW(-step2[13] + step2[14]); + step1[14] = WRAPLOW(step2[13] + step2[14]); + step1[15] = WRAPLOW(step2[12] + step2[15]); + + step1[16] = step2[16]; + step1[17] = step2[17]; + temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; + temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; + temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; + step1[19] = WRAPLOW(dct_const_round_shift(temp1)); + step1[28] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; + temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; + temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[0] = WRAPLOW(step1[0] + step1[7]); + step2[1] = WRAPLOW(step1[1] + step1[6]); + step2[2] = WRAPLOW(step1[2] + step1[5]); + step2[3] = 
WRAPLOW(step1[3] + step1[4]); + step2[4] = WRAPLOW(step1[3] - step1[4]); + step2[5] = WRAPLOW(step1[2] - step1[5]); + step2[6] = WRAPLOW(step1[1] - step1[6]); + step2[7] = WRAPLOW(step1[0] - step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + step2[14] = step1[14]; + step2[15] = step1[15]; + + step2[16] = WRAPLOW(step1[16] + step1[23]); + step2[17] = WRAPLOW(step1[17] + step1[22]); + step2[18] = WRAPLOW(step1[18] + step1[21]); + step2[19] = WRAPLOW(step1[19] + step1[20]); + step2[20] = WRAPLOW(step1[19] - step1[20]); + step2[21] = WRAPLOW(step1[18] - step1[21]); + step2[22] = WRAPLOW(step1[17] - step1[22]); + step2[23] = WRAPLOW(step1[16] - step1[23]); + + step2[24] = WRAPLOW(-step1[24] + step1[31]); + step2[25] = WRAPLOW(-step1[25] + step1[30]); + step2[26] = WRAPLOW(-step1[26] + step1[29]); + step2[27] = WRAPLOW(-step1[27] + step1[28]); + step2[28] = WRAPLOW(step1[27] + step1[28]); + step2[29] = WRAPLOW(step1[26] + step1[29]); + step2[30] = WRAPLOW(step1[25] + step1[30]); + step2[31] = WRAPLOW(step1[24] + step1[31]); + + // stage 7 + step1[0] = WRAPLOW(step2[0] + step2[15]); + step1[1] = WRAPLOW(step2[1] + step2[14]); + step1[2] = WRAPLOW(step2[2] + step2[13]); + step1[3] = WRAPLOW(step2[3] + step2[12]); + step1[4] = WRAPLOW(step2[4] + step2[11]); + step1[5] = WRAPLOW(step2[5] + step2[10]); + step1[6] = WRAPLOW(step2[6] + step2[9]); + step1[7] = WRAPLOW(step2[7] + step2[8]); + step1[8] = WRAPLOW(step2[7] - step2[8]); + step1[9] = WRAPLOW(step2[6] - step2[9]); + step1[10] = WRAPLOW(step2[5] - step2[10]); + step1[11] = WRAPLOW(step2[4] - step2[11]); + 
step1[12] = WRAPLOW(step2[3] - step2[12]); + step1[13] = WRAPLOW(step2[2] - step2[13]); + step1[14] = WRAPLOW(step2[1] - step2[14]); + step1[15] = WRAPLOW(step2[0] - step2[15]); + + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[18] = step2[18]; + step1[19] = step2[19]; + temp1 = (-step2[20] + step2[27]) * cospi_16_64; + temp2 = (step2[20] + step2[27]) * cospi_16_64; + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step2[21] + step2[26]) * cospi_16_64; + temp2 = (step2[21] + step2[26]) * cospi_16_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step2[22] + step2[25]) * cospi_16_64; + temp2 = (step2[22] + step2[25]) * cospi_16_64; + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step2[23] + step2[24]) * cospi_16_64; + temp2 = (step2[23] + step2[24]) * cospi_16_64; + step1[23] = WRAPLOW(dct_const_round_shift(temp1)); + step1[24] = WRAPLOW(dct_const_round_shift(temp2)); + step1[28] = step2[28]; + step1[29] = step2[29]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // final stage + output[0] = WRAPLOW(step1[0] + step1[31]); + output[1] = WRAPLOW(step1[1] + step1[30]); + output[2] = WRAPLOW(step1[2] + step1[29]); + output[3] = WRAPLOW(step1[3] + step1[28]); + output[4] = WRAPLOW(step1[4] + step1[27]); + output[5] = WRAPLOW(step1[5] + step1[26]); + output[6] = WRAPLOW(step1[6] + step1[25]); + output[7] = WRAPLOW(step1[7] + step1[24]); + output[8] = WRAPLOW(step1[8] + step1[23]); + output[9] = WRAPLOW(step1[9] + step1[22]); + output[10] = WRAPLOW(step1[10] + step1[21]); + output[11] = WRAPLOW(step1[11] + step1[20]); + output[12] = WRAPLOW(step1[12] + step1[19]); + output[13] = WRAPLOW(step1[13] + step1[18]); + output[14] = WRAPLOW(step1[14] + step1[17]); + output[15] = WRAPLOW(step1[15] + step1[16]); + output[16] = WRAPLOW(step1[15] - step1[16]); + 
output[17] = WRAPLOW(step1[14] - step1[17]); + output[18] = WRAPLOW(step1[13] - step1[18]); + output[19] = WRAPLOW(step1[12] - step1[19]); + output[20] = WRAPLOW(step1[11] - step1[20]); + output[21] = WRAPLOW(step1[10] - step1[21]); + output[22] = WRAPLOW(step1[9] - step1[22]); + output[23] = WRAPLOW(step1[8] - step1[23]); + output[24] = WRAPLOW(step1[7] - step1[24]); + output[25] = WRAPLOW(step1[6] - step1[25]); + output[26] = WRAPLOW(step1[5] - step1[26]); + output[27] = WRAPLOW(step1[4] - step1[27]); + output[28] = WRAPLOW(step1[3] - step1[28]); + output[29] = WRAPLOW(step1[2] - step1[29]); + output[30] = WRAPLOW(step1[1] - step1[30]); + output[31] = WRAPLOW(step1[0] - step1[31]); +} + +void aom_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[32 * 32]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + for (i = 0; i < 32; ++i) { + int16_t zero_coeff[16]; + for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; + for (j = 0; j < 8; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + for (j = 0; j < 4; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + for (j = 0; j < 2; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + + if (zero_coeff[0] | zero_coeff[1]) + aom_idct32_c(input, outptr); + else + memset(outptr, 0, sizeof(tran_low_t) * 32); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; + aom_idct32_c(temp_in, temp_out); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void aom_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[32 * 32] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + // only upper-left 16x16 has non-zero coeff + for 
(i = 0; i < 16; ++i) { + aom_idct32_c(input, outptr); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; + aom_idct32_c(temp_in, temp_out); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void aom_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[32 * 32] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + // only upper-left 8x8 has non-zero coeff + for (i = 0; i < 8; ++i) { + aom_idct32_c(input, outptr); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; + aom_idct32_c(temp_in, temp_out); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void aom_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_high_t a1; + + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + if (a1 == 0) return; + + for (j = 0; j < 32; ++j) { + for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1); + dest += stride; + } +} + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. 
*/ + int i; + tran_low_t output[16]; + tran_high_t a1, b1, c1, d1, e1; + const tran_low_t *ip = input; + tran_low_t *op = output; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + for (i = 0; i < 4; i++) { + a1 = ip[0] >> UNIT_QUANT_SHIFT; + c1 = ip[1] >> UNIT_QUANT_SHIFT; + d1 = ip[2] >> UNIT_QUANT_SHIFT; + b1 = ip[3] >> UNIT_QUANT_SHIFT; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + op[0] = HIGHBD_WRAPLOW(a1, bd); + op[1] = HIGHBD_WRAPLOW(b1, bd); + op[2] = HIGHBD_WRAPLOW(c1, bd); + op[3] = HIGHBD_WRAPLOW(d1, bd); + ip += 4; + op += 4; + } + + ip = output; + for (i = 0; i < 4; i++) { + a1 = ip[4 * 0]; + c1 = ip[4 * 1]; + d1 = ip[4 * 2]; + b1 = ip[4 * 3]; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + dest[stride * 0] = + highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd); + dest[stride * 1] = + highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd); + dest[stride * 2] = + highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd); + dest[stride * 3] = + highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd); + + ip++; + dest++; + } +} + +void aom_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, + int dest_stride, int bd) { + int i; + tran_high_t a1, e1; + tran_low_t tmp[4]; + const tran_low_t *ip = in; + tran_low_t *op = tmp; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + (void)bd; + + a1 = ip[0] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + op[0] = HIGHBD_WRAPLOW(a1, bd); + op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd); + + ip = tmp; + for (i = 0; i < 4; i++) { + e1 = ip[0] >> 1; + a1 = ip[0] - e1; + dest[dest_stride * 0] = + highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd); + dest[dest_stride * 1] = + highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd); + dest[dest_stride * 2] = + highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd); + dest[dest_stride * 3] = + 
highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd); + ip++; + dest++; + } +} + +void aom_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step[4]; + tran_high_t temp1, temp2; + (void)bd; + // stage 1 + temp1 = (input[0] + input[2]) * cospi_16_64; + temp2 = (input[0] - input[2]) * cospi_16_64; + step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + // stage 2 + output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd); + output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd); + output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd); + output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd); +} + +void aom_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[4], temp_out[4]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // Rows + for (i = 0; i < 4; ++i) { + aom_highbd_idct4_c(input, outptr, bd); + input += 4; + outptr += 4; + } + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; + aom_highbd_idct4_c(temp_in, temp_out, bd); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); + } + } +} + +void aom_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, + int dest_stride, int bd) { + int i; + tran_high_t a1; + tran_low_t out = + HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, 4); + + for (i = 0; i < 4; i++) { + dest[0] = 
highbd_clip_pixel_add(dest[0], a1, bd); + dest[1] = highbd_clip_pixel_add(dest[1], a1, bd); + dest[2] = highbd_clip_pixel_add(dest[2], a1, bd); + dest[3] = highbd_clip_pixel_add(dest[3], a1, bd); + dest += dest_stride; + } +} + +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/inv_txfm.h b/third_party/aom/aom_dsp/inv_txfm.h new file mode 100644 index 000000000..e64d463ea --- /dev/null +++ b/third_party/aom/aom_dsp/inv_txfm.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_INV_TXFM_H_ +#define AOM_DSP_INV_TXFM_H_ + +#include + +#include "./aom_config.h" +#include "aom_dsp/txfm_common.h" +#include "aom_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE tran_high_t dct_const_round_shift(tran_high_t input) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + return rv; +} + +static INLINE tran_high_t check_range(tran_high_t input, int bd) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + // For valid AV1 input streams, intermediate stage coefficients should always + // stay within the range of a signed 16 bit integer. Coefficients can go out + // of this range for invalid/corrupt AV1 streams. However, strictly checking + // this range for every intermediate coefficient can be burdensome for a decoder, + // therefore the following assertion is only enabled when configured with + // --enable-coefficient-range-checking.
+ // For valid highbitdepth AV1 streams, intermediate stage coefficients will + // stay within the ranges: + // - 8 bit: signed 16 bit integer + // - 10 bit: signed 18 bit integer + // - 12 bit: signed 20 bit integer + const int32_t int_max = (1 << (7 + bd)) - 1; + const int32_t int_min = -int_max - 1; + assert(int_min <= input); + assert(input <= int_max); + (void)int_min; +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + (void)bd; + return input; +} + +#define WRAPLOW(x) ((int32_t)check_range(x, 8)) +#if CONFIG_HIGHBITDEPTH +#define HIGHBD_WRAPLOW(x, bd) ((int32_t)check_range((x), bd)) +#endif // CONFIG_HIGHBITDEPTH + +void aom_idct4_c(const tran_low_t *input, tran_low_t *output); +void aom_idct8_c(const tran_low_t *input, tran_low_t *output); +void aom_idct16_c(const tran_low_t *input, tran_low_t *output); +void aom_idct32_c(const tran_low_t *input, tran_low_t *output); +void aom_iadst4_c(const tran_low_t *input, tran_low_t *output); +void aom_iadst8_c(const tran_low_t *input, tran_low_t *output); +void aom_iadst16_c(const tran_low_t *input, tran_low_t *output); + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd); +void aom_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd); +void aom_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd); +void aom_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd); + +void aom_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd); +void aom_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd); +void aom_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd); + +static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, + int bd) { + trans = HIGHBD_WRAPLOW(trans, bd); + return clip_pixel_highbd(dest + (int)trans, bd); +} +#endif + +static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) { + trans = WRAPLOW(trans); + return clip_pixel(dest + 
(int)trans); +} +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_INV_TXFM_H_ diff --git a/third_party/aom/aom_dsp/loopfilter.c b/third_party/aom/aom_dsp/loopfilter.c new file mode 100644 index 000000000..e2e839219 --- /dev/null +++ b/third_party/aom/aom_dsp/loopfilter.c @@ -0,0 +1,900 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +static INLINE int8_t signed_char_clamp(int t) { + return (int8_t)clamp(t, -128, 127); +} + +#define PARALLEL_DEBLOCKING_11_TAP 0 +#define PARALLEL_DEBLOCKING_9_TAP 0 + +#if CONFIG_HIGHBITDEPTH +static INLINE int16_t signed_char_clamp_high(int t, int bd) { + switch (bd) { + case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1); + case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1); + case 8: + default: return (int16_t)clamp(t, -128, 128 - 1); + } +} +#endif +#if CONFIG_PARALLEL_DEBLOCKING +// should we apply any filter at all: 11111111 yes, 00000000 no +static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1, + uint8_t p0, uint8_t q0, uint8_t q1) { + int8_t mask = 0; + mask |= (abs(p1 - p0) > limit) * -1; + mask |= (abs(q1 - q0) > limit) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + return ~mask; +} +#endif // CONFIG_PARALLEL_DEBLOCKING +static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3, + uint8_t p2, uint8_t p1, uint8_t 
p0, uint8_t q0, + uint8_t q1, uint8_t q2, uint8_t q3) { + int8_t mask = 0; + mask |= (abs(p3 - p2) > limit) * -1; + mask |= (abs(p2 - p1) > limit) * -1; + mask |= (abs(p1 - p0) > limit) * -1; + mask |= (abs(q1 - q0) > limit) * -1; + mask |= (abs(q2 - q1) > limit) * -1; + mask |= (abs(q3 - q2) > limit) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + return ~mask; +} + +static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2, + uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, + uint8_t q2, uint8_t q3) { + int8_t mask = 0; + mask |= (abs(p1 - p0) > thresh) * -1; + mask |= (abs(q1 - q0) > thresh) * -1; + mask |= (abs(p2 - p0) > thresh) * -1; + mask |= (abs(q2 - q0) > thresh) * -1; + mask |= (abs(p3 - p0) > thresh) * -1; + mask |= (abs(q3 - q0) > thresh) * -1; + return ~mask; +} + +#if PARALLEL_DEBLOCKING_9_TAP +static INLINE int8_t flat_mask2(uint8_t thresh, uint8_t p4, uint8_t p0, + uint8_t q0, uint8_t q4) { + int8_t mask = 0; + mask |= (abs(p4 - p0) > thresh) * -1; + mask |= (abs(q4 - q0) > thresh) * -1; + return ~mask; +} +#endif + +#if PARALLEL_DEBLOCKING_11_TAP +static INLINE int8_t flat_mask3(uint8_t thresh, uint8_t p5, uint8_t p4, + uint8_t p0, uint8_t q0, uint8_t q4, + uint8_t q5) { + int8_t mask = 0; + mask |= (abs(p4 - p0) > thresh) * -1; + mask |= (abs(q4 - q0) > thresh) * -1; + mask |= (abs(p5 - p0) > thresh) * -1; + mask |= (abs(q5 - q0) > thresh) * -1; + return ~mask; +} +#endif + +static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3, + uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, + uint8_t q1, uint8_t q2, uint8_t q3, + uint8_t q4) { + int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3); + mask |= (abs(p4 - p0) > thresh) * -1; + mask |= (abs(q4 - q0) > thresh) * -1; + return ~mask; +} + +// is there high edge variance internal edge: 11111111 yes, 00000000 no +static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1) { + int8_t hev = 0; + hev |= 
(abs(p1 - p0) > thresh) * -1; + hev |= (abs(q1 - q0) > thresh) * -1; + return hev; +} + +static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, + uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { + int8_t filter1, filter2; + + const int8_t ps1 = (int8_t)*op1 ^ 0x80; + const int8_t ps0 = (int8_t)*op0 ^ 0x80; + const int8_t qs0 = (int8_t)*oq0 ^ 0x80; + const int8_t qs1 = (int8_t)*oq1 ^ 0x80; + const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); + + // add outer taps if we have high edge variance + int8_t filter = signed_char_clamp(ps1 - qs1) & hev; + + // inner taps + filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; + + // save bottom 3 bits so that we round one side +4 and the other +3 + // if it equals 4 we'll set to adjust by -1 to account for the fact + // we'd round 3 the other way + filter1 = signed_char_clamp(filter + 4) >> 3; + filter2 = signed_char_clamp(filter + 3) >> 3; + + *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80; + *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80; + + // outer tap adjustments + filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + + *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80; + *op1 = signed_char_clamp(ps1 + filter) ^ 0x80; +} + +void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
+ for (i = 0; i < 8; ++i) { +#if !CONFIG_PARALLEL_DEBLOCKING + const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); +#else // CONFIG_PARALLEL_DEBLOCKING + const uint8_t p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p]; + const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1); +#endif // !CONFIG_PARALLEL_DEBLOCKING + filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); + ++s; + } +} + +void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
+ for (i = 0; i < 8; ++i) { +#if !CONFIG_PARALLEL_DEBLOCKING + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); +#else // CONFIG_PARALLEL_DEBLOCKING + const uint8_t p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1]; + const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1); +#endif // !CONFIG_PARALLEL_DEBLOCKING + filter4(mask, *thresh, s - 2, s - 1, s, s + 1); + s += pitch; + } +} + +void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); +} + +static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, + uint8_t *op3, uint8_t *op2, uint8_t *op1, + uint8_t *op0, uint8_t *oq0, uint8_t *oq1, + uint8_t *oq2, uint8_t *oq3) { + if (flat && mask) { + const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + + // 7-tap filter [1, 1, 1, 2, 1, 1, 1] + *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); + *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); + *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); + *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); + *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); + *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); + } else { + filter4(mask, thresh, op1, op0, oq0, oq1); + } +} + +void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd 
instructions. + for (i = 0; i < 8; ++i) { + const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, s + 3 * p); + ++s; + } +} + +void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + + for (i = 0; i < 8; ++i) { + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, + s + 3); + s += pitch; + } +} + +void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); +} + +#if PARALLEL_DEBLOCKING_11_TAP +static INLINE void filter12(int8_t mask, uint8_t thresh, uint8_t flat, + uint8_t flat2, uint8_t *op5, uint8_t *op4, + uint8_t *op3, uint8_t *op2, uint8_t *op1, + uint8_t *op0, uint8_t *oq0, uint8_t *oq1, + uint8_t *oq2, uint8_t *oq3, uint8_t *oq4, + uint8_t *oq5) { 
+ if (flat2 && flat && mask) { + const uint8_t p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1, + p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, + q5 = *oq5; + + // 11-tap filter [1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1] + *op4 = (p5 * 5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + 6) / 12; + *op3 = (p5 * 4 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + 6) / 12; + *op2 = (p5 * 3 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + 6) / 12; + *op1 = (p5 * 2 + p4 + p3 + p2 + p1 * 2 + p0 + q0 + q1 + q2 + q3 + 6) / 12; + *op0 = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + q1 + q2 + q3 + q4 + 6) / 12; + *oq0 = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + q4 + q5 + 6) / 12; + *oq1 = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + q3 + q4 + q5 * 2 + 6) / 12; + *oq2 = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 * 3 + 6) / 12; + *oq3 = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 * 4 + 6) / 12; + *oq4 = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 5 + 6) / 12; + } else { + filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); + } +} +#endif + +#if PARALLEL_DEBLOCKING_9_TAP +static INLINE void filter10(int8_t mask, uint8_t thresh, uint8_t flat, + uint8_t flat2, uint8_t *op4, uint8_t *op3, + uint8_t *op2, uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, + uint8_t *oq3, uint8_t *oq4) { + if (flat2 && flat && mask) { + const uint8_t p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4; + + // 9-tap filter [1, 1, 1, 1, 2, 1, 1, 1, 1] + *op3 = (p4 * 4 + p3 * 2 + p2 + p1 + p0 + q0 + 5) / 10; + *op2 = (p4 * 3 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + 5) / 10; + *op1 = (p4 * 2 + p3 + p2 + p1 * 2 + p0 + q0 + q1 + q2 + 5) / 10; + *op0 = (p4 + p3 + p2 + p1 + p0 * 2 + q0 + q1 + q2 + q3 + 5) / 10; + *oq0 = (p3 + p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + q4 + 5) / 10; + *oq1 = (p2 + p1 + p0 + q0 + q1 * 2 + q2 + q3 + q4 * 2 + 5) / 10; + *oq2 = (p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 * 3 + 5) 
/ 10; + *oq3 = (p0 + q0 + q1 + q2 + q3 * 2 + q4 * 4 + 5) / 10; + } else { + filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); + } +} +#endif + +static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat, + uint8_t flat2, uint8_t *op7, uint8_t *op6, + uint8_t *op5, uint8_t *op4, uint8_t *op3, + uint8_t *op2, uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, + uint8_t *oq3, uint8_t *oq4, uint8_t *oq5, + uint8_t *oq6, uint8_t *oq7) { + if (flat2 && flat && mask) { + const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, + p2 = *op2, p1 = *op1, p0 = *op0; + + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, + q5 = *oq5, q6 = *oq6, q7 = *oq7; + + // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] + *op6 = ROUND_POWER_OF_TWO( + p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4); + *op5 = ROUND_POWER_OF_TWO( + p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4); + *op4 = ROUND_POWER_OF_TWO( + p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4); + *op3 = ROUND_POWER_OF_TWO( + p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4); + *op2 = ROUND_POWER_OF_TWO( + p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4, + 4); + *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 + + q0 + q1 + q2 + q3 + q4 + q5, + 4); + *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + + q1 + q2 + q3 + q4 + q5 + q6, + 4); + *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + + q2 + q3 + q4 + q5 + q6 + q7, + 4); + *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + + q3 + q4 + q5 + q6 + q7 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 
+ q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); + *oq6 = ROUND_POWER_OF_TWO( + p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); + } else { + filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); + } +} + +static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8 * count; ++i) { + const uint8_t p7 = s[-8 * p], p6 = s[-7 * p], p5 = s[-6 * p], + p4 = s[-5 * p], p3 = s[-4 * p], p2 = s[-3 * p], + p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p], + q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p], q7 = s[7 * p]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + +#if PARALLEL_DEBLOCKING_11_TAP + const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5); + + filter12(mask, *thresh, flat, flat2, s - 6 * p, s - 5 * p, s - 4 * p, + s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, + s + 3 * p, s + 4 * p, s + 5 * p); + +#elif PARALLEL_DEBLOCKING_9_TAP + const int8_t flat2 = flat_mask2(1, p4, p0, q0, q4); + + filter10(mask, *thresh, flat, flat2, s - 5 * p, s - 4 * p, s - 3 * p, + s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, + s + 4 * p); +#else + const int8_t flat2 = flat_mask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7); + + filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, + s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, + s + 7 * p); +#endif + + ++s; + } +} + +void aom_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t 
*thresh) { + mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); +} + +void aom_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2); +} + +static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { + int i; + + for (i = 0; i < count; ++i) { + const uint8_t p7 = s[-8], p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4], + p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3], q4 = s[4], + q5 = s[5], q6 = s[6], q7 = s[7]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + +#if PARALLEL_DEBLOCKING_11_TAP + const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5); + + filter12(mask, *thresh, flat, flat2, s - 6, s - 5, s - 4, s - 3, s - 2, + s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5); +#elif PARALLEL_DEBLOCKING_9_TAP + const int8_t flat2 = flat_mask2(1, p4, p0, q0, q4); + + filter10(mask, *thresh, flat, flat2, s - 5, s - 4, s - 3, s - 2, s - 1, s, + s + 1, s + 2, s + 3, s + 4); + +#else + const int8_t flat2 = flat_mask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7); + + filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4, + s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, + s + 7); +#endif + + s += p; + } +} + +void aom_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8); +} + +void aom_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16); +} + +#if CONFIG_HIGHBITDEPTH +#if CONFIG_PARALLEL_DEBLOCKING +// Should we apply any filter at all: 11111111 yes, 
00000000 no ? +static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit, + uint16_t p1, uint16_t p0, uint16_t q0, + uint16_t q1, int bd) { + int8_t mask = 0; + int16_t limit16 = (uint16_t)limit << (bd - 8); + int16_t blimit16 = (uint16_t)blimit << (bd - 8); + mask |= (abs(p1 - p0) > limit16) * -1; + mask |= (abs(q1 - q0) > limit16) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; + return ~mask; +} +#endif // CONFIG_PARALLEL_DEBLOCKING + +// Should we apply any filter at all: 11111111 yes, 00000000 no ? +static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, + uint16_t p3, uint16_t p2, uint16_t p1, + uint16_t p0, uint16_t q0, uint16_t q1, + uint16_t q2, uint16_t q3, int bd) { + int8_t mask = 0; + int16_t limit16 = (uint16_t)limit << (bd - 8); + int16_t blimit16 = (uint16_t)blimit << (bd - 8); + mask |= (abs(p3 - p2) > limit16) * -1; + mask |= (abs(p2 - p1) > limit16) * -1; + mask |= (abs(p1 - p0) > limit16) * -1; + mask |= (abs(q1 - q0) > limit16) * -1; + mask |= (abs(q2 - q1) > limit16) * -1; + mask |= (abs(q3 - q2) > limit16) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; + return ~mask; +} + +static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2, + uint16_t p1, uint16_t p0, uint16_t q0, + uint16_t q1, uint16_t q2, uint16_t q3, + int bd) { + int8_t mask = 0; + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + mask |= (abs(p1 - p0) > thresh16) * -1; + mask |= (abs(q1 - q0) > thresh16) * -1; + mask |= (abs(p2 - p0) > thresh16) * -1; + mask |= (abs(q2 - q0) > thresh16) * -1; + mask |= (abs(p3 - p0) > thresh16) * -1; + mask |= (abs(q3 - q0) > thresh16) * -1; + return ~mask; +} + +static INLINE int8_t highbd_flat_mask5(uint8_t thresh, uint16_t p4, uint16_t p3, + uint16_t p2, uint16_t p1, uint16_t p0, + uint16_t q0, uint16_t q1, uint16_t q2, + uint16_t q3, uint16_t q4, int bd) { + int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd); + int16_t 
thresh16 = (uint16_t)thresh << (bd - 8); + mask |= (abs(p4 - p0) > thresh16) * -1; + mask |= (abs(q4 - q0) > thresh16) * -1; + return ~mask; +} + +// Is there high edge variance internal edge: +// 11111111_11111111 yes, 00000000_00000000 no ? +static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0, + uint16_t q0, uint16_t q1, int bd) { + int16_t hev = 0; + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + hev |= (abs(p1 - p0) > thresh16) * -1; + hev |= (abs(q1 - q0) > thresh16) * -1; + return hev; +} + +static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, + uint16_t *op0, uint16_t *oq0, uint16_t *oq1, + int bd) { + int16_t filter1, filter2; + // ^0x80 equivalent to subtracting 0x80 from the values to turn them + // into -128 to +127 instead of 0 to 255. + int shift = bd - 8; + const int16_t ps1 = (int16_t)*op1 - (0x80 << shift); + const int16_t ps0 = (int16_t)*op0 - (0x80 << shift); + const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift); + const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift); + const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd); + + // Add outer taps if we have high edge variance. + int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev; + + // Inner taps. + filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask; + + // Save bottom 3 bits so that we round one side +4 and the other +3 + // if it equals 4 we'll set to adjust by -1 to account for the fact + // we'd round 3 the other way. + filter1 = signed_char_clamp_high(filter + 4, bd) >> 3; + filter2 = signed_char_clamp_high(filter + 3, bd) >> 3; + + *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift); + *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift); + + // Outer tap adjustments. 
+ filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + + *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift); + *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift); +} + +void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8; ++i) { +#if !CONFIG_PARALLEL_DEBLOCKING + const uint16_t p3 = s[-4 * p]; + const uint16_t p2 = s[-3 * p]; + const uint16_t p1 = s[-2 * p]; + const uint16_t p0 = s[-p]; + const uint16_t q0 = s[0 * p]; + const uint16_t q1 = s[1 * p]; + const uint16_t q2 = s[2 * p]; + const uint16_t q3 = s[3 * p]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); +#else // CONFIG_PARALLEL_DEBLOCKING + const uint16_t p1 = s[-2 * p]; + const uint16_t p0 = s[-p]; + const uint16_t q0 = s[0 * p]; + const uint16_t q1 = s[1 * p]; + const int8_t mask = + highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd); +#endif // !CONFIG_PARALLEL_DEBLOCKING + highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd); + ++s; + } +} + +void aom_highbd_lpf_horizontal_4_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
+ for (i = 0; i < 8; ++i) { +#if !CONFIG_PARALLEL_DEBLOCKING + const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); +#else // CONFIG_PARALLEL_DEBLOCKING + const uint16_t p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1]; + const int8_t mask = + highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd); +#endif // !CONFIG_PARALLEL_DEBLOCKING + highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd); + s += pitch; + } +} + +void aom_highbd_lpf_vertical_4_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} + +static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, + uint16_t *op3, uint16_t *op2, uint16_t *op1, + uint16_t *op0, uint16_t *oq0, uint16_t *oq1, + uint16_t *oq2, uint16_t *oq3, int bd) { + if (flat && mask) { + const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + + // 7-tap filter [1, 1, 1, 2, 1, 1, 1] + *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); + *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); + *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); + *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); + *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); + *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); + } else { + highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); + } +} + +void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const 
uint8_t *thresh, + int bd) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8; ++i) { + const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, + s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd); + ++s; + } +} + +void aom_highbd_lpf_horizontal_8_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + + for (i = 0; i < 8; ++i) { + const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, + s + 2, s + 3, bd); + s += pitch; + } +} + +void aom_highbd_lpf_vertical_8_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1, + 
bd); +} + +static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat, + uint8_t flat2, uint16_t *op7, uint16_t *op6, + uint16_t *op5, uint16_t *op4, uint16_t *op3, + uint16_t *op2, uint16_t *op1, uint16_t *op0, + uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, + uint16_t *oq3, uint16_t *oq4, uint16_t *oq5, + uint16_t *oq6, uint16_t *oq7, int bd) { + if (flat2 && flat && mask) { + const uint16_t p7 = *op7; + const uint16_t p6 = *op6; + const uint16_t p5 = *op5; + const uint16_t p4 = *op4; + const uint16_t p3 = *op3; + const uint16_t p2 = *op2; + const uint16_t p1 = *op1; + const uint16_t p0 = *op0; + const uint16_t q0 = *oq0; + const uint16_t q1 = *oq1; + const uint16_t q2 = *oq2; + const uint16_t q3 = *oq3; + const uint16_t q4 = *oq4; + const uint16_t q5 = *oq5; + const uint16_t q6 = *oq6; + const uint16_t q7 = *oq7; + + // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] + *op6 = ROUND_POWER_OF_TWO( + p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4); + *op5 = ROUND_POWER_OF_TWO( + p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4); + *op4 = ROUND_POWER_OF_TWO( + p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4); + *op3 = ROUND_POWER_OF_TWO( + p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4); + *op2 = ROUND_POWER_OF_TWO( + p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4, + 4); + *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 + + q0 + q1 + q2 + q3 + q4 + q5, + 4); + *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + + q1 + q2 + q3 + q4 + q5 + q6, + 4); + *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + + q2 + q3 + q4 + q5 + q6 + q7, + 4); + *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + + q3 + q4 + q5 + q6 + q7 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p3 + 
p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); + *oq6 = ROUND_POWER_OF_TWO( + p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); + } else { + highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3, + bd); + } +} + +static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count, + int bd) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8 * count; ++i) { + const uint16_t p3 = s[-4 * p]; + const uint16_t p2 = s[-3 * p]; + const uint16_t p1 = s[-2 * p]; + const uint16_t p0 = s[-p]; + const uint16_t q0 = s[0 * p]; + const uint16_t q1 = s[1 * p]; + const uint16_t q2 = s[2 * p]; + const uint16_t q3 = s[3 * p]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat2 = + highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, + s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd); + + highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, + s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, + s + 6 * p, s + 7 * p, bd); + ++s; + } +} + +void aom_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd); +} + +void aom_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, p, 
blimit, limit, thresh, 2, bd); +} + +static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count, + int bd) { + int i; + + for (i = 0; i < count; ++i) { + const uint16_t p3 = s[-4]; + const uint16_t p2 = s[-3]; + const uint16_t p1 = s[-2]; + const uint16_t p0 = s[-1]; + const uint16_t q0 = s[0]; + const uint16_t q1 = s[1]; + const uint16_t q2 = s[2]; + const uint16_t q3 = s[3]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, + q0, s[4], s[5], s[6], s[7], bd); + + highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, + s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, + s + 5, s + 6, s + 7, bd); + s += p; + } +} + +void aom_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd); +} + +void aom_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd); +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/mips/add_noise_msa.c b/third_party/aom/aom_dsp/mips/add_noise_msa.c new file mode 100644 index 000000000..4c6e201e1 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/add_noise_msa.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "./macros_msa.h" + +void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise, + char blackclamp[16], char whiteclamp[16], + char bothclamp[16], uint32_t width, + uint32_t height, int32_t pitch) { + uint32_t i, j; + + for (i = 0; i < height / 2; ++i) { + uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch; + int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff)); + uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch; + int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff)); + for (j = width / 16; j--;) { + v16i8 temp00_s, temp01_s; + v16u8 temp00, temp01, black_clamp, white_clamp; + v16u8 pos0, ref0, pos1, ref1; + v16i8 const127 = __msa_ldi_b(127); + + pos0 = LD_UB(pos0_ptr); + ref0 = LD_UB(ref0_ptr); + pos1 = LD_UB(pos1_ptr); + ref1 = LD_UB(ref1_ptr); + black_clamp = (v16u8)__msa_fill_b(blackclamp[0]); + white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]); + temp00 = (pos0 < black_clamp); + pos0 = __msa_bmnz_v(pos0, black_clamp, temp00); + temp01 = (pos1 < black_clamp); + pos1 = __msa_bmnz_v(pos1, black_clamp, temp01); + XORI_B2_128_UB(pos0, pos1); + temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127); + temp00 = (v16u8)(temp00_s < pos0); + pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00); + temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127); + temp01 = (temp01_s < pos1); + pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01); + XORI_B2_128_UB(pos0, pos1); + pos0 += ref0; + ST_UB(pos0, pos0_ptr); + pos1 += ref1; + ST_UB(pos1, pos1_ptr); + pos0_ptr += 16; + pos1_ptr += 16; + ref0_ptr += 16; + ref1_ptr += 16; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c new file mode 100644 index 000000000..847394a3d --- /dev/null +++ 
b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c @@ -0,0 +1,704 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/aom_convolve_msa.h" + +static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 dst0, dst1, dst2, dst3, res2, res3; + v16u8 mask0, mask1, mask2, mask3; + v8i16 filt, res0, res1; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, res0, res1); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + SRARI_H2_SH(res0, res1, FILTER_BITS); + SAT_SH2_SH(res0, res1, 7); + PCKEV_B2_UB(res0, res0, res1, res1, res2, res3); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + XORI_B2_128_UB(res2, res3); + AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, 
src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8i16 filt, vec0, vec1, vec2, vec3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, vec0, vec1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, vec2, vec3); + SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS); + SAT_SH4_SH(vec0, vec1, vec2, vec3, 7); + PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2, + res3); + ILVR_D2_UB(res1, res0, res3, res2, res0, res2); + XORI_B2_128_UB(res0, res2); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, + dst6); + ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4); + AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2); + ST4x8_UB(res0, res2, dst, dst_stride); +} + +static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + 
int32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, + dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, dst0, dst1; + v8i16 filt, out0, out1, out2, out3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height >> 1; loop_cnt--;) { + LD_SB2(src, src_stride, src0, src2); + LD_SB2(src + 8, src_stride, src1, src3); + src += (2 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12); + VSHF_B4_SH(src1, src1, mask0, mask1, mask2, 
mask3, vec1, vec5, vec9, vec13); + VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, + vec14); + VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, + vec15); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, + vec9, vec10, vec11); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1, + vec2, vec3); + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, + vec9, vec10, vec11); + ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, + out2, out3); + LD_UB2(dst, dst_stride, dst0, dst1); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst); + dst += dst_stride; + PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 dst1, dst2, mask0, mask1, mask2, mask3; + v8i16 filt, out0, out1, out2, out3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12); + VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, 
vec9, vec13); + VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, + vec14); + VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, + vec15); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, + vec9, vec10, vec11); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1, + vec2, vec3); + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, + vec9, vec10, vec11); + ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + LD_UB2(dst, 16, dst1, dst2); + PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst); + PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 dst1, dst2, mask0, mask1, mask2, mask3; + v8i16 filt, out0, out1, out2, out3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { + for (cnt = 0; cnt < 2; ++cnt) { + src0 = LD_SB(&src[cnt << 5]); + src2 = LD_SB(&src[16 + (cnt << 5)]); + src3 = LD_SB(&src[24 + (cnt << 5)]); + src1 = __msa_sldi_b(src2, src0, 8); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, + vec12); + VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, 
vec1, vec5, vec9, + vec13); + VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, + vec14); + VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, + vec15); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, + vec1, vec2, vec3); + DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, + vec9, vec10, vec11); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, + vec1, vec2, vec3); + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, + vec9, vec10, vec11); + ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + LD_UB2(&dst[cnt << 5], 16, dst1, dst2); + PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]); + PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]); + } + + src += src_stride; + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1; + v8u16 vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); + SRARI_H2_UH(vec2, vec3, FILTER_BITS); + PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + 
int8_t *filter) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8u16 vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, + vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); + PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, + dst6); + AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, + res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, 
src_stride, src0, src1, src2, src3); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); +} + +static void common_hz_2t_and_aver_dst_8x8mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); + dst += (4 * dst_stride); + + if (16 == height) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * 
src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD_SB4(src, src_stride, src0, src1, src2, src3); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); + } +} + +static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, 
src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, + res2, res3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, + res6, res7); + SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); + SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST_UB(res1, res0, dst0, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res3, res2, dst1, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res5, res4, dst2, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res7, res6, dst3, dst); + dst += dst_stride; + + for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, + res2, res3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, + res6, res7); + SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); + SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST_UB(res1, res0, dst0, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res3, res2, dst1, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res5, res4, dst2, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res7, res6, dst3, dst); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, 
int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = (height >> 1); loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + src4 = LD_SB(src); + src6 = LD_SB(src + 16); + src7 = LD_SB(src + 24); + src5 = __msa_sldi_b(src6, src4, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, + res2, res3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, + res6, res7); + SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); + SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); + LD_UB2(dst, 16, dst0, dst1); + PCKEV_AVG_ST_UB(res1, res0, dst0, dst); + PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16)); + dst += dst_stride; + LD_UB2(dst, 16, dst2, dst3); + PCKEV_AVG_ST_UB(res5, res4, dst2, dst); + PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16)); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* 
rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = height; loop_cnt--;) { + LD_SB4(src, 16, src0, src2, src4, src6); + src7 = LD_SB(src + 56); + SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + LD_UB4(dst, 16, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST_UB(out1, out0, dst0, dst); + PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16); + PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32); + PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48); + dst += dst_stride; + } +} + +void aom_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + + if (((const int32_t *)filter_x)[0] == 0) { + switch (w) { + case 4: + common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 8: + common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 16: + common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 32: + common_hz_2t_and_aver_dst_32w_msa(src, 
(int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 64: + common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + default: + aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 8: + common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 16: + common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 32: + common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 64: + common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + default: + aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c new file mode 100644 index 000000000..bed600d5b --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c @@ -0,0 +1,605 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/aom_convolve_msa.h"
+/* 4-wide block: 8-tap horizontal pass, then 8-tap vertical pass, result averaged with the pixels already in dst. */
+static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
+  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
+  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
+  src -= (3 + 3 * src_stride); /* start 3 columns left and 3 rows up */
+
+  /* rearranging filter */
+  filt = LD_SH(filter_horiz);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+  /* Horizontal pass over the 7 rows loaded above, two source rows per call. */
+  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
+
+  filt = LD_SH(filter_vert);
+  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+  vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) { /* 4 output rows per trip */
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
+    vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+    res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
+    vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+    res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+
+    SRARI_H2_SH(res0, res1, FILTER_BITS);
+    SAT_SH2_SH(res0, res1, 7);
+    PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
+    XORI_B2_128_UB(tmp0, tmp1);
+    AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
+    ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Carry the newest horizontal results over to the next trip. */
+    hz_out5 = hz_out9;
+    vec0 = vec2;
+    vec1 = vec3;
+    vec2 = vec4;
+  }
+}
+/* 8-wide block: 8-tap horizontal pass, then 8-tap vertical pass, result averaged with the pixels already in dst. */
+static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
+  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
+  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= (3 + 3 * src_stride); /* start 3 columns left and 3 rows up */
+
+  /* rearranging filter */
+  filt = LD_SH(filter_horiz);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+  /* Horizontal pass over the 7 rows loaded above, one source row per call. */
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+
+  filt = LD_SH(filter_vert);
+  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
+  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) { /* 4 output rows per trip */
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
+    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
+                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
+    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+    CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst,
+                            dst_stride);
+    dst += (4 * dst_stride);
+    /* Carry the newest horizontal results over to the next trip. */
+    hz_out6 = hz_out10;
+    out0 = out2;
+    out1 = out3;
+    out2 = out8;
+    out4 = out6;
+    out5 = out7;
+    out6 = out9;
+  }
+}
+/* 16-wide 8-tap/8-tap block, processed as two 8-column strips by the 8w routine. */
+static void common_hv_8ht_8vt_and_aver_dst_16w_msa(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 2; multiple8_cnt--;) {
+    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+                                          filter_horiz, filter_vert, height);
+    src += 8;
+    dst += 8;
+  }
+}
+/* 32-wide 8-tap/8-tap block, processed as four 8-column strips by the 8w routine. */
+static void common_hv_8ht_8vt_and_aver_dst_32w_msa(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 4; multiple8_cnt--;) {
+    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+                                          filter_horiz, filter_vert, height);
+    src += 8;
+    dst += 8;
+  }
+}
+/* 64-wide 8-tap/8-tap block, processed as eight 8-column strips by the 8w routine. */
+static void common_hv_8ht_8vt_and_aver_dst_64w_msa(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 8; multiple8_cnt--;) {
+    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+                                          filter_horiz, filter_vert, height);
+    src += 8;
+    dst += 8;
+  }
+}
+/* 4x4 block: 2-tap horizontal pass, then 2-tap vertical pass, result averaged with the pixels already in dst. */
+static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert) {
+  v16i8 src0, src1, src2, src3, src4, mask;
+  v16u8 filt_hz, filt_vt, vec0, vec1;
+  v16u8 dst0, dst1, dst2, dst3, res0, res1;
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[16]);
+
+  /* rearranging filter */
+  filt = LD_UH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  filt = LD_UH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  LD_SB5(src, src_stride, src0, src1, src2, src3, src4); /* 4 rows + 1 extra */
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+/* 4x8 block: 2-tap horizontal pass, then 2-tap vertical pass, result averaged with the pixels already in dst. */
+static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert) {
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[16]);
+
+  /* rearranging filter */
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+  src += (8 * src_stride);
+  src8 = LD_SB(src); /* ninth row: 8 rows + 1 extra */
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
+  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
+  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
+  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+             hz_out3, hz_out5, 8);
+  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
+
+  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
+             dst6);
+  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0,
+              tmp1, tmp2, tmp3);
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2,
+              res3);
+  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
+              res3);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+  dst += (4 * dst_stride);
+  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+/* Width-4 2-tap entry: height 4 and height 8 go to the fixed-size routines; any other height writes nothing. */
+static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  if (4 == height) {
+    common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert);
+  } else if (8 == height) {
+    common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert);
+  }
+}
+/* 8x4 block: 2-tap horizontal pass, then 2-tap vertical pass, result averaged with the pixels already in dst. */
+static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert) {
+  v16i8 src0, src1, src2, src3, src4, mask;
+  v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter */
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+  src += (5 * src_stride); /* src is not read again in this routine */
+
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+  /* hz_out0 / hz_out1 alternate as "previous row" and "current row". */
+  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
+                     dst_stride);
+}
+/* 8-wide block of any height handled 4 rows at a time: 2-tap horizontal + 2-tap vertical, averaged with dst. */
+static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, mask;
+  v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
+  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter */
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+  src0 = LD_SB(src);
+  src += src_stride;
+  /* Row 0 is filtered once up front; each trip then consumes 4 further rows. */
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
+                       dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+/* Width-8 2-tap entry: height 4 uses the fixed 8x4 routine, every other height uses the 4-rows-per-trip routine. */
+static void common_hv_2ht_2vt_and_aver_dst_8w_msa(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  if (4 == height) {
+    common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert);
+  } else {
+    common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
+        src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
+  }
+}
+/* 16-wide block, 4 rows per trip: 2-tap horizontal + 2-tap vertical, averaged with dst; each row is two 8-column halves. */
+static void common_hv_2ht_2vt_and_aver_dst_16w_msa(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+  v8i16 filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter */
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+  LD_SB2(src, 8, src0, src1); /* row 0: loads at src and src + 8 */
+  src += src_stride;
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    /* (hz_out0, hz_out2) and (hz_out1, hz_out3) alternate as previous/current row. */
+    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+    dst += dst_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
+    dst += dst_stride;
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
+    dst += dst_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
+    dst += dst_stride;
+  }
+}
+/* 32-wide 2-tap/2-tap block, processed as two 16-column strips (the counter name says 8, the step is 16). */
+static void common_hv_2ht_2vt_and_aver_dst_32w_msa(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 2; multiple8_cnt--;) {
+    common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert, height);
+    src += 16;
+    dst += 16;
+  }
+}
+/* 64-wide 2-tap/2-tap block, processed as four 16-column strips (the counter name says 8, the step is 16). */
+static void common_hv_2ht_2vt_and_aver_dst_64w_msa(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 4; multiple8_cnt--;) {
+    common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert, height);
+    src += 16;
+    dst += 16;
+  }
+}
+/* Public MSA entry point: horizontal + vertical convolution averaged into dst. Picks a helper from the kernel shape and block width w; anything it has no helper for goes to aom_convolve8_avg_c. */
+void aom_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, /* source pixels, row pitch */
+                           uint8_t *dst, ptrdiff_t dst_stride, /* read, averaged and rewritten; row pitch */
+                           const int16_t *filter_x, int x_step_q4, /* 8 horizontal taps; step must be 16 */
+                           const int16_t *filter_y, int y_step_q4, int w, /* 8 vertical taps; step must be 16; width */
+                           int h) { /* block height in rows */
+  int8_t cnt, filt_hor[24] = { 0 }, filt_ver[24] = { 0 };
+  /* 24 >= 3 + 16: the helpers load a whole v8i16/v8u16 vector (LD_SH/LD_UH) from &filt[0] or &filt[3]; zero padding keeps that read inside the array. */
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+  assert(!(filter_x[2] == 0 && filter_x[3] == 128));
+  assert(!(filter_y[2] == 0 && filter_y[3] == 128));
+  /* Taps are narrowed to int8_t; 128 does not fit, which is presumably why the asserts reject a (0, 128) pair at taps 2..3. */
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_hor[cnt] = filter_x[cnt];
+    filt_ver[cnt] = filter_y[cnt];
+  }
+  /* Taps are compared element-wise (no int32_t punning of the int16_t arrays). Taps 0 and 1 both zero selects the 2-tap helpers, which read from index 3. */
+  if (filter_x[0] == 0 && filter_x[1] == 0 &&
+      filter_y[0] == 0 && filter_y[1] == 0) {
+    switch (w) {
+      case 4:
+        common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+                                              (int32_t)dst_stride, &filt_hor[3],
+                                              &filt_ver[3], h);
+        break;
+      case 8:
+        common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+                                              (int32_t)dst_stride, &filt_hor[3],
+                                              &filt_ver[3], h);
+        break;
+      case 16:
+        common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride,
+                                               &filt_hor[3], &filt_ver[3], h);
+        break;
+      case 32:
+        common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride,
+                                               &filt_hor[3], &filt_ver[3], h);
+        break;
+      case 64:
+        common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride,
+                                               &filt_hor[3], &filt_ver[3], h);
+        break;
+      default:
+        aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
+                            x_step_q4, filter_y, y_step_q4, w, h);
+        break;
+    }
+  } else if ((filter_x[0] == 0 && filter_x[1] == 0) || /* only one direction is 2-tap: */
+             (filter_y[0] == 0 && filter_y[1] == 0)) { /* no mixed helper, use C */
+    aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+                        filter_y, y_step_q4, w, h);
+  } else { /* neither kernel starts with two zero taps: 8-tap helpers */
+    switch (w) {
+      case 4:
+        common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
+                                              (int32_t)dst_stride, filt_hor,
+                                              filt_ver, h);
+        break;
+      case 8:
+        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
+                                              (int32_t)dst_stride, filt_hor,
+                                              filt_ver, h);
+        break;
+      case 16:
+        common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride, filt_hor,
+                                               filt_ver, h);
+        break;
+      case 32:
+        common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride, filt_hor,
+                                               filt_ver, h);
+        break;
+      case 64:
+        common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride, filt_hor,
+                                               filt_ver, h);
+        break;
+      default:
+        aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
+                            x_step_q4, filter_y, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c
new file mode 100644
index 000000000..dae771104
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c
@@ -0,0 +1,677 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/aom_convolve_msa.h" + +static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 dst0, dst1, dst2, dst3, out; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; + v16i8 src10998, filt0, filt1, filt2, filt3; + v8i16 filt, out10, out32; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110, + src4332, src6554); + XORI_B3_128_SB(src2110, src4332, src6554); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); + XORI_B2_128_SB(src8776, src10998); + out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0, + filt1, filt2, filt3); + out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0, + filt1, filt2, filt3); + SRARI_H2_SH(out10, out32, FILTER_BITS); + SAT_SH2_SH(out10, out32, 7); + out = PCKEV_XORI128_UB(out10, out32); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + + dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0); + out = __msa_aver_u_b(out, dst0); + + ST4x4_UB(out, out, 0, 1, 2, 3, dst, 
dst_stride); + dst += (4 * dst_stride); + + src2110 = src6554; + src4332 = src8776; + src6554 = src10998; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 dst0, dst1, dst2, dst3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; + v8i16 filt, out0, out1, out2, out3; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, filt1, + filt2, filt3); + out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, filt1, + filt2, filt3); + out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, filt1, + filt2, filt3); + out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, + dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + 
src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_16w_mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height, int32_t width) { + const uint8_t *src_tmp; + uint8_t *dst_tmp; + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16i8 filt0, filt1, filt2, filt3; + v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src_tmp += (7 * src_stride); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); + src_tmp += (4 * src_stride); + + LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3); + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + 
filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, + filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, + filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, + filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst0, dst1, + dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride); + dst_tmp += (4 * dst_stride); + + src10_r = src54_r; /* carry the newest row pairs (right and left halves) into the next pass */ + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } + + src += 16; /* move on to the next 16-byte column strip */ + dst += 16; + } +} +/* 16-wide entry: forwards every argument to common_vt_8t_and_aver_dst_16w_mult_msa with width fixed at 16. */ +static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, + filter, height, 16); +} +/* 32-wide entry: forwards every argument to common_vt_8t_and_aver_dst_16w_mult_msa with width fixed at 32. */ +static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + 
common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, + filter, height, 32); +} +/* 64-wide entry: forwards every argument to common_vt_8t_and_aver_dst_16w_mult_msa with width fixed at 64. */ +static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, + filter, height, 64); +} +/* 2-tap vertical filter for a 4x4 block: reads five source rows and four dst rows, and stores the __msa_aver_u_b combination of the filtered result and the dst rows back to dst. Only halfword 0 of the loaded filter vector is used. */ +static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, src4; + v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 filt; + v8u16 tmp0, tmp1; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + src4 = LD_SB(src); /* fifth row, needed for the last output row */ + src += src_stride; + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, dst0); + + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} +/* 2-tap vertical filter for a 4x8 block: reads nine source rows and eight dst rows, and writes the result as two 4x4 stores. */ +static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16u8 src2110, src4332, src6554, src8776, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB8(src, src_stride, src0, 
src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_SB(src); /* ninth row, needed for the last output row */ + + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2, + dst3); + ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, + src76_r, src87_r); + ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, + src76_r, src2110, src4332, src6554, src8776); + DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); /* src2110/src4332 are reused as output registers from here on */ + AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride); +} +/* Height dispatcher for the 4-wide 2-tap vertical path. Only heights 4 and 8 are handled; any other height returns without touching dst. */ +static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} +/* 2-tap vertical filter for an 8x4 block: reads five source rows and four dst rows, then passes the filtered rows, the dst rows and the dst pointer to PCKEV_AVG_ST8x4_UB (macro defined outside this chunk). */ +static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16u8 src0, src1, src2, src3, src4; + v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); + 
ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, + dst_stride); +} +/* 2-tap vertical filter for 8-wide blocks processed eight rows at a time: each of the (height >> 3) loop passes loads eight new source rows and eight dst rows and issues two 8x4 average-and-store steps. src0 carries the last source row of one pass into the next. Rows beyond a multiple of eight are not processed. */ +static void common_vt_2t_and_aver_dst_8x8mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); /* first row, paired with src1 in the first pass */ + src += src_stride; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); + src += (8 * src_stride); + LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8); + + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6, + vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst, + dst_stride); + dst += (4 * dst_stride); + + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst, + dst_stride); + dst += (4 * dst_stride); + + src0 = src8; /* last row of this pass pairs with the first row of the next */ + } +} +/* Height dispatcher for the 8-wide 2-tap vertical path: height 4 uses the 8x4 routine, every other height goes to the 8x8mult routine. */ +static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + 
common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, + filter, height); + } +} +/* 2-tap vertical filter, 16 wide: each of the (height >> 2) loop passes loads four new source rows and four dst rows and stores four rows, one PCKEV_AVG_ST_UB call per row (macro defined outside this chunk). src0 carries the last source row into the next pass. */ +static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3, filt; + + /* rearranging filter_y */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + src0 = LD_UB(src); /* first row, paired with src1 in the first pass */ + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + dst += dst_stride; + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst); + dst += dst_stride; + + src0 = src4; /* last row of this pass pairs with the first row of the next */ + } +} +/* 2-tap vertical filter, 32 wide, handled as two 16-byte column halves (byte offsets 0 and 16 in both src and dst); each of the (height >> 2) loop passes stores four rows per half. src0 and src5 carry the last source row of the left and right half into the next pass. */ +static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16u8 dst0, dst1, dst2, 
dst3, dst4, dst5, dst6, dst7; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3, filt; + + /* rearranging filter_y */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_UB2(src, 16, src0, src5); /* first row; presumably src0 = bytes 0..15 and src5 = bytes 16..31 (step argument is 16) */ + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { /* four output rows per pass */ + LD_UB4(src, src_stride, src1, src2, src3, src4); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + + LD_UB4(src + 16, src_stride, src6, src7, src8, src9); /* same four rows, columns starting at byte 16 */ + LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7); + src += (4 * src_stride); + + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride); + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride); + + ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); /* second half: outputs go to dst + 16 */ + ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride); + + ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); + ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, 
dst6, dst + 16 + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride); + dst += (4 * dst_stride); + + src0 = src4; /* keep the last row of each half for the next pass */ + src5 = src9; + } +} +/* 2-tap vertical filter, 64 wide, handled as four 16-byte column groups (byte offsets 0, 16, 32 and 48 in both src and dst); each of the (height >> 1) loop passes stores two rows per group. src0, src3, src6 and src9 carry the last source row of each group into the next pass. */ +static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5; + v16u8 src6, src7, src8, src9, src10, src11, filt0; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8u16 filt; + + /* rearranging filter_y */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_UB4(src, 16, src0, src3, src6, src9); /* first row; presumably four consecutive 16-byte pieces (step argument is 16) */ + src += src_stride; + + for (loop_cnt = (height >> 1); loop_cnt--;) { /* two output rows per pass */ + LD_UB2(src, src_stride, src1, src2); + LD_UB2(dst, dst_stride, dst0, dst1); + LD_UB2(src + 16, src_stride, src4, src5); + LD_UB2(dst + 16, dst_stride, dst2, dst3); + LD_UB2(src + 32, src_stride, src7, src8); + LD_UB2(dst + 32, dst_stride, dst4, dst5); + LD_UB2(src + 48, src_stride, src10, src11); + LD_UB2(dst + 48, dst_stride, dst6, dst7); + src += (2 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride); + + ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); + ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + 
SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride); + + ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); + ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride); + + ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); + ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48)); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride); + dst += (2 * dst_stride); + + src0 = src2; /* keep the last row of each column group for the next pass */ + src3 = src5; + src6 = src8; + src9 = src11; + } +} +/* Public MSA entry point for the vertical convolve-and-average. Asserts y_step_q4 == 16. The eight int16 taps of filter_y are narrowed into an int8 array; when the first 32-bit word of filter_y is zero (taps 0 and 1 both zero) the 2-tap routines are called with &filt_ver[3], otherwise the 8-tap routines get the whole array. Widths other than 4, 8, 16, 32 and 64 fall back to aom_convolve8_avg_vert_c; filter_x and x_step_q4 are only forwarded to that fallback. NOTE(review): the helpers read the taps with LD_SH into a v8i16, which looks wider than this 8-byte array (more so from &filt_ver[3]) - confirm that reading past the array is intended. */ +void aom_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + int8_t cnt, filt_ver[8]; + + assert(y_step_q4 == 16); /* only the unscaled step is supported here */ + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_ver[cnt] = filter_y[cnt]; /* int16 tap narrowed to int8 */ + } + + if (((const int32_t *)filter_y)[0] == 0) { /* taps 0 and 1 are zero: 2-tap path */ + switch (w) { + case 4: + common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 8: + common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 16: + common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 32: + common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 64: + 
common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + default: /* width without an MSA routine: use the C implementation */ + aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } else { /* first 32-bit word of filter_y is non-zero: 8-tap path with all eight narrowed taps */ + switch (w) { + case 4: + common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 8: + common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 16: + common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + + break; + case 32: + common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 64: + common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + default: /* width without an MSA routine: use the C implementation */ + aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c new file mode 100644 index 000000000..fc3a823c5 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c @@ -0,0 +1,692 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/aom_convolve_msa.h" +/* 8-tap horizontal filter for a 4x4 block. src is moved back three bytes before the four rows are loaded, and the base shuffle mask is read from mc_filt_mask_arr[16] (table defined outside this chunk). The result is stored to dst without reading dst. */ +static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 mask0, mask1, mask2, mask3, out; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v8i16 filt, out0, out1; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; /* start three bytes left of the first output column */ + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; /* masks for the later tap pairs are the base mask offset by 2, 4 and 6 */ + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, out0, out1); + SRARI_H2_SH(out0, out1, FILTER_BITS); + SAT_SH2_SH(out0, out1, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} +/* 8-tap horizontal filter for a 4x8 block: the same steps as the 4x4 routine applied to two groups of four rows, written with two 4x4 stores. */ +static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 filt0, filt1, filt2, filt3; + v16i8 src0, src1, src2, src3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; /* start three bytes left of the first output column */ + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, out0, out1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + 
SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} +/* Height dispatcher for the 4-wide 8-tap horizontal path. Only heights 4 and 8 are handled; any other height returns without touching dst. */ +static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} +/* 8-tap horizontal filter for an 8x4 block. src is moved back three bytes and the base shuffle mask is read from mc_filt_mask_arr[0] (table defined outside this chunk); four rows are filtered and written with one ST8x4_UB. */ +static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; /* start three bytes left of the first output column */ + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, out0, out1, out2, + out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); +} +/* 8-tap horizontal filter for 8-wide blocks of larger height: despite the 8x8mult name the loop works in groups of four rows, running (height >> 2) passes. */ +static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; /* start three bytes left of the first output column */ + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, 
filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { /* four rows per pass */ + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} +/* Height dispatcher for the 8-wide 8-tap horizontal path: height 4 uses the 8x4 routine, every other height goes to the 8x8mult routine. */ +static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); + } +} +/* 8-tap horizontal filter, 16 wide: each of the (height >> 1) loop passes handles two rows, loading every row as two vectors at byte offsets 0 and 8 and storing one 16-byte vector per row. */ +static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; /* start three bytes left of the first output column */ + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { /* two rows per pass */ + LD_SB2(src, src_stride, src0, src2); /* offset 0 of both rows */ + LD_SB2(src + 8, src_stride, src1, src3); /* offset 8 of both rows */ + XORI_B4_128_SB(src0, src1, src2, src3); + src += (2 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + 
ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst); + dst += dst_stride; + } +} +/* 8-tap horizontal filter, 32 wide: each of the (height >> 1) loop passes handles two rows. Per row it loads vectors at byte offsets 0, 16 and 24 and builds the fourth input with __msa_sldi_b(src2, src0, 8) instead of another load, then stores two 16-byte vectors at dst and dst + 16. The loads for the second row are issued before the first row is stored. */ +static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; /* start three bytes left of the first output column */ + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { /* two rows per pass */ + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + + src0 = LD_SB(src); /* second row of the pair */ + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + dst += dst_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + dst += dst_stride; + } +} +/* 8-tap horizontal filter, 64 wide: one row per loop pass (the loop runs height times), processed as two 32-byte halves that store to dst, dst + 16, dst + 32 and dst + 48. */ +static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + int32_t loop_cnt; + v16i8 src0, 
src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; /* start three bytes left of the first output column */ + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { /* one row per pass */ + src0 = LD_SB(src); /* first 32-byte half of the row */ + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + + src0 = LD_SB(src + 32); /* second 32-byte half of the same row */ + src2 = LD_SB(src + 48); + src3 = LD_SB(src + 56); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst + 32); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 48); + dst += dst_stride; + } +} +/* 2-tap horizontal filter for a 4x4 block. Unlike the 8-tap routines in this file no offset is applied to src before loading; the shuffle mask is read from mc_filt_mask_arr[16] and only halfword 0 of the loaded filter vector is used. The result is stored to dst without reading dst. */ +static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, vec0, vec1, res0, res1; + v8u16 vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt0, 
filt0, vec2, vec3); + SRARI_H2_UH(vec2, vec3, FILTER_BITS); + PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} +/* 2-tap horizontal filter for a 4x8 block: loads eight rows at once, filters them in pairs and writes the result with two 4x4 stores. The shuffle mask is read from mc_filt_mask_arr[16]. */ +static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 vec0, vec1, vec2, vec3, filt0; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16i8 res0, res1, res2, res3; + v8u16 vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, + vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} +/* Height dispatcher for the 4-wide 2-tap horizontal path. Only heights 4 and 8 are handled; any other height returns without touching dst. */ +static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} +/* 2-tap horizontal filter for an 8x4 block. The shuffle mask is read from mc_filt_mask_arr[0]; vec0..vec3 hold the shuffled bytes first and are then overwritten with the dot-product results, and src0/src1 are reused for the packed output. */ +static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 filt0; + v16i8 src0, src1, src2, src3, mask; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); + ST8x4_UB(src0, src1, dst, dst_stride); +} +/* 2-tap horizontal filter for 8-wide blocks, written as straight-line code rather than a loop: eight rows are always filtered (two groups of four), and a further eight are filtered only when height == 16. No other height is fully covered by this routine. */ +static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + v16u8 filt0; + v16i8 src0, src1, src2, src3, mask, out0, out1; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); /* rows 0..3 */ + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + + LD_SB4(src, src_stride, src0, src1, src2, src3); /* rows 4..7, loaded before rows 0..3 are stored */ + src += (4 * src_stride); + + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + if (16 == height) { /* rows 8..15 exist only for height 16 */ + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, 
FILTER_BITS); + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride); /* dst was not advanced after the previous store, hence the explicit 4-row offset */ + } +} +/* Height dispatcher for the 8-wide 2-tap horizontal path: height 4 uses the 8x4 routine, every other height goes to the 8x8mult routine. */ +static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); + } +} +/* 2-tap horizontal filter, 16 wide, four rows at a time. The first group of four rows is handled before the loop and the loop then runs (height >> 2) - 1 further passes; every row is loaded as two vectors at byte offsets 0 and 8. NOTE(review): loop_cnt is unsigned, so a height below 4 would wrap the count - callers presumably never pass one; confirm. */ +static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + loop_cnt = (height >> 2) - 1; /* one group of four rows is done ahead of the loop */ + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, 
out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + dst += dst_stride; + PCKEV_ST_SB(out2, out3, dst); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + dst += dst_stride; + PCKEV_ST_SB(out6, out7, dst); + dst += dst_stride; + + for (; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + dst += dst_stride; + PCKEV_ST_SB(out2, out3, dst); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + dst += dst_stride; + PCKEV_ST_SB(out6, out7, dst); + dst += dst_stride; + } +} + +static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = height >> 1; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + src4 = LD_SB(src); + src6 = LD_SB(src + 16); + src7 = LD_SB(src + 24); + src5 
= __msa_sldi_b(src6, src4, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + PCKEV_ST_SB(out2, out3, dst + 16); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + PCKEV_ST_SB(out6, out7, dst + 16); + dst += dst_stride; + } +} + +static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src4 = LD_SB(src + 32); + src6 = LD_SB(src + 48); + src7 = LD_SB(src + 56); + SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, 
out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + PCKEV_ST_SB(out2, out3, dst + 16); + PCKEV_ST_SB(out4, out5, dst + 32); + PCKEV_ST_SB(out6, out7, dst + 48); + dst += dst_stride; + } +} + +void aom_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + + if (((const int32_t *)filter_x)[0] == 0) { + switch (w) { + case 4: + common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 8: + common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 16: + common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 32: + common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 64: + common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + default: + aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 8: + common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 16: + common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 32: + common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 64: + common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + default: 
+ aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c new file mode 100644 index 000000000..a4d594931 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c @@ -0,0 +1,630 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/aom_convolve_msa.h" + +const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + 
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); + out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); + out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); + tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + SRARI_H2_SH(tmp0, tmp1, FILTER_BITS); + SAT_SH2_SH(tmp0, tmp1, 7); + out = PCKEV_XORI128_UB(tmp0, 
tmp1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out5 = hz_out9; + out0 = out2; + out1 = out3; + out2 = out4; + } +} + +static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v16u8 mask0, mask1, mask2, mask3, vec0, vec1; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, 
filt_hz3); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4); + ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + XORI_B4_128_SB(src7, src8, src9, src10); + + hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); + tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); + tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9); + tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + vec0 = PCKEV_XORI128_UB(tmp0, tmp1); + vec1 = PCKEV_XORI128_UB(tmp2, tmp3); + ST8x4_UB(vec0, vec1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out6 = hz_out10; + out0 = out2; + out1 = out3; + out2 = out8; + out4 = out6; + out5 = out7; + out6 = out9; + } +} + +static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, 
int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); + + filt = LD_UH(filter_vert); + filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + 
hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + v16i8 res0, res1, res2, res3; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); + + filt = LD_UH(filter_vert); + filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_SB(src); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); + hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); + hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); + SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, + hz_out3, hz_out5, 8); + hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); + + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4, + vec5, vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, 
vec7, res0, res1, res2, + res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + if (4 == height) { + common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } else if (8 == height) { + common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } +} + +static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, mask, out0, out1; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec1, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec2, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec3, filt_vt); + + 
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, mask, out0, out1; + v16u8 filt_hz, filt_vt, vec0; + v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_SB(src); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp4 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp3, tmp4, FILTER_BITS); + PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out1 = 
HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp5 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp6 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp7 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp8 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS); + PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + if (4 == height) { + common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } else { + common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + } +} + +static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt_hz, filt_vt, vec0, vec1; + v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 
FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_2ht_2vt_16w_msa(src, 
src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; + } +} + +static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; + } +} + +void aom_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int32_t x_step_q4, const int16_t *filter_y, + int32_t y_step_q4, int32_t w, int32_t h) { + int8_t cnt, filt_hor[8], filt_ver[8]; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + filt_ver[cnt] = filter_y[cnt]; + } + + if (((const int32_t *)filter_x)[0] == 0 && + ((const int32_t *)filter_y)[0] == 0) { + switch (w) { + case 4: + common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 8: + common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 16: + common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 32: + common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 64: + common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + default: + aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + break; + } + } else if (((const int32_t 
*)filter_x)[0] == 0 || + ((const int32_t *)filter_y)[0] == 0) { + aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + } else { + switch (w) { + case 4: + common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 8: + common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 16: + common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 32: + common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 64: + common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + default: + aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c new file mode 100644 index 000000000..f7bdfc2bd --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c @@ -0,0 +1,699 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/mips/aom_convolve_msa.h"
+
+/* Vertical 8-tap filter for a 4-pixel-wide column.
+ *
+ * Reading starts 3 rows above `src`.  Seven rows are preloaded, then every
+ * loop pass loads 4 more rows and stores 4 output rows.  The loop runs
+ * (height >> 2) times, so trailing rows of a height that is not a multiple
+ * of 4 are not written.  `filter` is read with LD_SH() and its first four
+ * halfword lanes are splatted into filt0..filt3. */
+static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+  v16i8 src10998, filt0, filt1, filt2, filt3;
+  v16u8 out;
+  v8i16 filt, out10, out32;
+
+  src -= (3 * src_stride);
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+
+  /* Interleave neighbouring rows, then pack two row pairs per vector. */
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+             src4332, src6554);
+  /* NOTE(review): XORI_*_128 presumably flips the sign bit so unsigned
+   * pixels can go through the signed dot product - confirm in macros_msa.h. */
+  XORI_B3_128_SB(src2110, src4332, src6554);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+    XORI_B2_128_SB(src8776, src10998);
+    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+                                filt1, filt2, filt3);
+    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+                                filt1, filt2, filt3);
+    SRARI_H2_SH(out10, out32, FILTER_BITS);
+    SAT_SH2_SH(out10, out32, 7);
+    out = PCKEV_XORI128_UB(out10, out32);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    /* Slide the row window down by 4 rows for the next pass. */
+    src2110 = src6554;
+    src4332 = src8776;
+    src6554 = src10998;
+    src6 = src10;
+  }
+}
+
+/* Vertical 8-tap filter for an 8-pixel-wide column.
+ *
+ * Same row window as the 4-wide kernel: 7 rows preloaded starting 3 rows
+ * above `src`, then 4 rows loaded and 4 rows stored per pass, for
+ * (height >> 2) passes. */
+static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+  v16u8 tmp0, tmp1;
+  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+  src -= (3 * src_stride);
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                 filt1, filt2, filt3);
+    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                 filt1, filt2, filt3);
+    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                 filt1, filt2, filt3);
+    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                 filt1, filt2, filt3);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+    tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+    tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    /* Slide the row window down by 4 rows for the next pass. */
+    src10_r = src54_r;
+    src32_r = src76_r;
+    src54_r = src98_r;
+    src21_r = src65_r;
+    src43_r = src87_r;
+    src65_r = src109_r;
+    src6 = src10;
+  }
+}
+
+/* Vertical 8-tap filter for a 16-pixel-wide column.
+ *
+ * Like the 8-wide kernel, but both the right (_r) and left (_l) interleaved
+ * halves of every row pair are filtered and packed into full 16-byte
+ * stores.  Runs (height >> 2) passes of 4 rows. */
+static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 filt0, filt1, filt2, filt3;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+  v16u8 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+  src -= (3 * src_stride);
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+             src54_l, src21_l);
+  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+               src87_l, src98_l, src109_l);
+    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                 filt1, filt2, filt3);
+    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                 filt1, filt2, filt3);
+    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                 filt1, filt2, filt3);
+    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                 filt1, filt2, filt3);
+    out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+                                 filt1, filt2, filt3);
+    out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+                                 filt1, filt2, filt3);
+    out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+                                 filt1, filt2, filt3);
+    out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+                                 filt1, filt2, filt3);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+    SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+    SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+    PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r,
+                tmp0, tmp1, tmp2, tmp3);
+    XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+    ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    /* Slide the row window down by 4 rows for the next pass. */
+    src10_r = src54_r;
+    src32_r = src76_r;
+    src54_r = src98_r;
+    src21_r = src65_r;
+    src43_r = src87_r;
+    src65_r = src109_r;
+    src10_l = src54_l;
+    src32_l = src76_l;
+    src54_l = src98_l;
+    src21_l = src65_l;
+    src43_l = src87_l;
+    src65_l = src109_l;
+    src6 = src10;
+  }
+}
+
+/* Vertical 8-tap filter for widths that are a multiple of 16.
+ *
+ * Runs the 16-wide kernel once per 16-pixel column strip: (width >> 4)
+ * strips, each with (height >> 2) passes of 4 rows.  `src`/`dst` advance by
+ * 16 bytes per strip while src_tmp/dst_tmp walk down the rows. */
+static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter, int32_t height,
+                                      int32_t width) {
+  const uint8_t *src_tmp;
+  uint8_t *dst_tmp;
+  uint32_t loop_cnt, cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 filt0, filt1, filt2, filt3;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+  v16u8 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+  src -= (3 * src_stride);
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  for (cnt = (width >> 4); cnt--;) {
+    src_tmp = src;
+    dst_tmp = dst;
+
+    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src_tmp += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+               src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+      LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+      XORI_B4_128_SB(src7, src8, src9, src10);
+      src_tmp += (4 * src_stride);
+      ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                 src87_r, src98_r, src109_r);
+      ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                 src87_l, src98_l, src109_l);
+      out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                   filt1, filt2, filt3);
+      out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                   filt1, filt2, filt3);
+      out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                   filt1, filt2, filt3);
+      out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                   filt1, filt2, filt3);
+      out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+                                   filt1, filt2, filt3);
+      out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+                                   filt1, filt2, filt3);
+      out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+                                   filt1, filt2, filt3);
+      out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+                                   filt1, filt2, filt3);
+      SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+      SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+      SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+      SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+      PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                  out3_r, tmp0, tmp1, tmp2, tmp3);
+      XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+      ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
+      dst_tmp += (4 * dst_stride);
+
+      /* Slide the row window down by 4 rows for the next pass. */
+      src10_r = src54_r;
+      src32_r = src76_r;
+      src54_r = src98_r;
+      src21_r = src65_r;
+      src43_r = src87_r;
+      src65_r = src109_r;
+      src10_l = src54_l;
+      src32_l = src76_l;
+      src54_l = src98_l;
+      src21_l = src65_l;
+      src43_l = src87_l;
+      src65_l = src109_l;
+      src6 = src10;
+    }
+
+    /* Next 16-pixel column strip. */
+    src += 16;
+    dst += 16;
+  }
+}
+
+/* Vertical 8-tap filter, 32 pixels wide: two 16-wide strips. */
+static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                            32);
+}
+
+/* Vertical 8-tap filter, 64 pixels wide: four 16-wide strips. */
+static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                            64);
+}
+
+/* Vertical 2-tap filter for one 4x4 block.
+ *
+ * Loads 5 rows starting at `src` (no rows above are read) and stores 4
+ * rows.  Only halfword lane 0 of the vector loaded from `filter` is used,
+ * i.e. the two taps the pointer refers to. */
+static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16i8 src0, src1, src2, src3, src4;
+  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+  v16u8 filt0;
+  v8i16 filt;
+  v8u16 tmp0, tmp1;
+
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+  src += (5 * src_stride);
+
+  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+             src32_r, src43_r);
+  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+  src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+}
+
+/* Vertical 2-tap filter for one 4x8 block.
+ *
+ * Loads 9 rows starting at `src` (8 rows, then one more) and stores 8 rows
+ * as two 4x4 stores. */
+static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v16u8 filt0;
+  v8i16 filt;
+
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+  src += (8 * src_stride);
+
+  src8 = LD_SB(src);
+  src += src_stride;
+
+  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+             src32_r, src43_r);
+  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+             src76_r, src87_r);
+  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
+             src76_r, src2110, src4332, src6554, src8776);
+  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+              tmp0, tmp1, tmp2, tmp3);
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+}
+
+/* Vertical 2-tap filter for a 4-pixel-wide column of any height that is a
+ * multiple of 4.
+ *
+ * The column is processed in 8-row chunks with the 4x8 kernel and finished
+ * with one 4x4 kernel call when 4 or more rows remain.  Heights 4 and 8
+ * behave exactly as before; previously every other height was silently
+ * left unwritten.  As with the 8-tap kernels, trailing rows of a height
+ * that is not a multiple of 4 are not written. */
+static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  /* Each 4x8 call reads rows 0..8 of its chunk, so consecutive chunks can
+   * simply start 8 rows further down. */
+  for (; height >= 8; height -= 8) {
+    common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    src += (8 * src_stride);
+    dst += (8 * dst_stride);
+  }
+
+  if (height >= 4) {
+    common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+/* Vertical 2-tap filter for one 8x4 block: loads 5 rows, stores 4. */
+static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+  v16i8 out0, out1;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  /* rearranging filter_y */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+              tmp2, tmp3);
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+  ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+/* Vertical 2-tap filter for an 8-pixel-wide column, 8 rows per pass.
+ *
+ * Runs (height >> 3) passes; each pass loads 8 new rows (the last row of
+ * the previous pass is carried over in src0) and stores two 8x4 groups.
+ * Rows beyond the last full group of 8 are not written. */
+static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v16i8 out0, out1;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  /* rearranging filter_y */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 3); loop_cnt--;) {
+    LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+    src += (8 * src_stride);
+
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
+               vec3);
+    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
+               vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+                tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
+                tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    /* Last loaded row becomes the first row of the next pass. */
+    src0 = src8;
+  }
+}
+
+/* Vertical 2-tap filter for an 8-pixel-wide column: height 4 uses the 8x4
+ * kernel, every other height goes to the 8-rows-per-pass kernel. */
+static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (4 == height) {
+    common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+  }
+}
+
+/* Vertical 2-tap filter for a 16-pixel-wide column.
+ *
+ * Runs (height >> 2) passes; each pass loads 4 new rows (src0 carries the
+ * previous pass's last row) and stores 4 full 16-byte rows one at a time. */
+static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  /* rearranging filter_y */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
+    dst += dst_stride;
+
+    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst);
+    dst += dst_stride;
+
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
+    dst += dst_stride;
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst);
+    dst += dst_stride;
+
+    /* Last loaded row becomes the first row of the next pass. */
+    src0 = src4;
+  }
+}
+
+/* Vertical 2-tap filter for a 32-pixel-wide column.
+ *
+ * Runs (height >> 2) passes of 4 rows.  The left 16 pixels use src0..src4
+ * and the right 16 pixels (offset +16) use src5..src9; src0/src5 carry the
+ * previous pass's last row. */
+static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  /* rearranging filter_y */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  src0 = LD_UB(src);
+  src5 = LD_UB(src + 16);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+
+    LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
+    src += (4 * src_stride);
+
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
+
+    ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+    ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 16);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
+
+    ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+    ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
+    dst += (4 * dst_stride);
+
+    /* Carry the last row of each half into the next pass. */
+    src0 = src4;
+    src5 = src9;
+  }
+}
+
+/* Vertical 2-tap filter for a 64-pixel-wide column.
+ *
+ * Runs (height >> 1) passes of 2 rows.  The four 16-pixel quarters (offsets
+ * 0, 16, 32, 48) keep their previous row in src0, src3, src6 and src9. */
+static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v8i16 filt;
+
+  /* rearranging filter_y */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  LD_UB4(src, 16, src0, src3, src6, src9);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 1); loop_cnt--;) {
+    LD_UB2(src, src_stride, src1, src2);
+    LD_UB2(src + 16, src_stride, src4, src5);
+    LD_UB2(src + 32, src_stride, src7, src8);
+    LD_UB2(src + 48, src_stride, src10, src11);
+    src += (2 * src_stride);
+
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+    ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+    ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+    PCKEV_ST_SB(tmp4, tmp5, dst + 16);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+    PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
+
+    ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+    ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 32);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
+
+    ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+    ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+    PCKEV_ST_SB(tmp4, tmp5, dst + 48);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+    PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
+    dst += (2 * dst_stride);
+
+    /* Carry the last row of each quarter into the next pass. */
+    src0 = src2;
+    src3 = src5;
+    src6 = src8;
+    src9 = src11;
+  }
+}
+
+/* Vertical convolution entry point (MSA).
+ *
+ * Only unscaled filtering is supported (y_step_q4 must be 16, asserted).
+ * The eight 16-bit taps of `filter_y` are narrowed to 8 bits.  When the
+ * first two taps are both zero (tested as one 32-bit word) the 2-tap
+ * kernels are used, starting at tap 3; otherwise the 8-tap kernels are
+ * used.  Widths other than 4/8/16/32/64 fall back to aom_convolve8_vert_c.
+ * `filter_x` and `x_step_q4` are only forwarded to that fallback. */
+void aom_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4, int w,
+                            int h) {
+  /* The kernels read the taps with LD_SH() into a v8i16, i.e. a whole
+   * 16-byte vector (NOTE(review): confirm LD_SH in macros_msa.h), and the
+   * 2-tap path starts at &filt_ver[3].  Keep zeroed slack after the 8 taps
+   * so that load never runs past the end of the array. */
+  int8_t cnt, filt_ver[32] = { 0 };
+
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (cnt = 8; cnt--;) {
+    filt_ver[cnt] = (int8_t)filter_y[cnt];
+  }
+
+  if (((const int32_t *)filter_y)[0] == 0) {
+    switch (w) {
+      case 4:
+        common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            &filt_ver[3], h);
+        break;
+      case 8:
+        common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            &filt_ver[3], h);
+        break;
+      case 16:
+        common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_ver[3], h);
+        break;
+      case 32:
+        common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_ver[3], h);
+        break;
+      case 64:
+        common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_ver[3], h);
+        break;
+      default:
+        aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
+                             x_step_q4, filter_y, y_step_q4, w, h);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            filt_ver, h);
+        break;
+      case 8:
+        common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            filt_ver, h);
+        break;
+      case 16:
+        common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_ver, h);
+        break;
+      case 32:
+        common_vt_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_ver, h);
+        break;
+      case 64:
+        common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_ver, h);
+        break;
+      default:
+        aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
+                             x_step_q4, filter_y, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c
new file mode 100644
index 000000000..75f8c7ea8
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media.
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/macros_msa.h"
+
+/* Scalar rounding average of `rows` rows of `width` pixels into dst:
+ * dst = (dst + src + 1) >> 1.  Does nothing when rows <= 0.  The vector
+ * kernels below use it for the rows left over after their last full batch,
+ * and the entry point uses it for widths without a vector kernel. */
+static void avg_rows_c(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+                       int32_t dst_stride, int32_t width, int32_t rows) {
+  int32_t row, col;
+
+  for (row = 0; row < rows; ++row) {
+    for (col = 0; col < width; ++col) {
+      dst[col] = (uint8_t)(((dst[col] + src[col]) + 1) >> 1);
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+/* Averages a 4-pixel-wide column of src into dst.
+ *
+ * Heights that are a multiple of 4 are handled 4 rows per pass, other even
+ * heights 2 rows per pass; only the low 32 bits of each averaged vector
+ * are stored.  Odd heights, which used to be silently skipped, go through
+ * the scalar helper. */
+static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+                           int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  uint32_t out0, out1, out2, out3;
+  v16u8 src0, src1, src2, src3;
+  v16u8 dst0, dst1, dst2, dst3;
+
+  if (0 == (height % 4)) {
+    for (cnt = (height / 4); cnt--;) {
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      src += (4 * src_stride);
+
+      LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+      AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+                  dst2, dst3);
+
+      out0 = __msa_copy_u_w((v4i32)dst0, 0);
+      out1 = __msa_copy_u_w((v4i32)dst1, 0);
+      out2 = __msa_copy_u_w((v4i32)dst2, 0);
+      out3 = __msa_copy_u_w((v4i32)dst3, 0);
+      SW4(out0, out1, out2, out3, dst, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else if (0 == (height % 2)) {
+    for (cnt = (height / 2); cnt--;) {
+      LD_UB2(src, src_stride, src0, src1);
+      src += (2 * src_stride);
+
+      LD_UB2(dst, dst_stride, dst0, dst1);
+
+      AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
+
+      out0 = __msa_copy_u_w((v4i32)dst0, 0);
+      out1 = __msa_copy_u_w((v4i32)dst1, 0);
+      SW(out0, dst);
+      dst += dst_stride;
+      SW(out1, dst);
+      dst += dst_stride;
+    }
+  } else {
+    avg_rows_c(src, src_stride, dst, dst_stride, 4, height);
+  }
+}
+
+/* Averages an 8-pixel-wide column of src into dst, 4 rows per pass; only
+ * the low 64 bits of each averaged vector are stored.  Rows left over when
+ * the height is not a multiple of 4 are finished by the scalar helper. */
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+                           int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  uint64_t out0, out1, out2, out3;
+  v16u8 src0, src1, src2, src3;
+  v16u8 dst0, dst1, dst2, dst3;
+
+  for (cnt = (height / 4); cnt--;) {
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+                dst2, dst3);
+
+    out0 = __msa_copy_u_d((v2i64)dst0, 0);
+    out1 = __msa_copy_u_d((v2i64)dst1, 0);
+    out2 = __msa_copy_u_d((v2i64)dst2, 0);
+    out3 = __msa_copy_u_d((v2i64)dst3, 0);
+    SD4(out0, out1, out2, out3, dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+
+  /* src/dst now point just past the last full batch. */
+  avg_rows_c(src, src_stride, dst, dst_stride, 8, height % 4);
+}
+
+/* Averages a 16-pixel-wide column of src into dst, 8 rows per pass.  Rows
+ * left over when the height is not a multiple of 8 (including every row of
+ * a height below 8) are finished by the scalar helper. */
+static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+  for (cnt = (height / 8); cnt--;) {
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+                dst2, dst3);
+    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+                dst6, dst7);
+    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
+    dst += (8 * dst_stride);
+  }
+
+  /* src/dst now point just past the last full batch. */
+  avg_rows_c(src, src_stride, dst, dst_stride, 16, height % 8);
+}
+
+/* Averages a 32-pixel-wide column of src into dst, 8 rows per pass (two
+ * groups of 4 rows, each row as two 16-byte halves).  dst_dup is the read
+ * cursor for dst while dst itself is the write cursor.  Rows left over when
+ * the height is not a multiple of 8 are finished by the scalar helper. */
+static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  uint8_t *dst_dup = dst;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+  for (cnt = (height / 8); cnt--;) {
+    LD_UB4(src, src_stride, src0, src2, src4, src6);
+    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+    LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
+    LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
+    dst_dup += (4 * dst_stride);
+    LD_UB4(src, src_stride, src8, src10, src12, src14);
+    LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
+    src += (4 * src_stride);
+    LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
+    LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
+    dst_dup += (4 * dst_stride);
+
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+                dst2, dst3);
+    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+                dst6, dst7);
+    AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9,
+                dst10, dst11);
+    AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12,
+                dst13, dst14, dst15);
+
+    ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
+    ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
+    dst += (4 * dst_stride);
+    ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
+    ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
+    dst += (4 * dst_stride);
+  }
+
+  /* src/dst now point just past the last full batch. */
+  avg_rows_c(src, src_stride, dst, dst_stride, 32, height % 8);
+}
+
+/* Averages a 64-pixel-wide column of src into dst, 4 rows per pass (each
+ * row as four 16-byte vectors).  dst_dup is the read cursor for dst while
+ * dst itself is the write cursor.  Rows left over when the height is not a
+ * multiple of 4 are finished by the scalar helper. */
+static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  uint8_t *dst_dup = dst;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+  for (cnt = (height / 4); cnt--;) {
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_UB4(src, 16, src4, src5, src6, src7);
+    src += src_stride;
+    LD_UB4(src, 16, src8, src9, src10, src11);
+    src += src_stride;
+    LD_UB4(src, 16, src12, src13, src14, src15);
+    src += src_stride;
+
+    LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
+    dst_dup += dst_stride;
+    LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
+    dst_dup += dst_stride;
+    LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
+    dst_dup += dst_stride;
+    LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
+    dst_dup += dst_stride;
+
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+                dst2, dst3);
+    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+                dst6, dst7);
+    AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9,
+                dst10, dst11);
+    AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12,
+                dst13, dst14, dst15);
+
+    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
+    dst += dst_stride;
+    ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
+    dst += dst_stride;
+    ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
+    dst += dst_stride;
+    ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
+    dst += dst_stride;
+  }
+
+  /* src/dst now point just past the last full batch. */
+  avg_rows_c(src, src_stride, dst, dst_stride, 64, height % 4);
+}
+
+/* Averaging "convolution" entry point (MSA): dst = round((dst + src) / 2)
+ * over a w x h block.  The filter arguments are ignored.  Widths 4, 8, 16,
+ * 32 and 64 use the vector kernels; any other width is averaged pixel by
+ * pixel. */
+void aom_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const int16_t *filter_x, int32_t filter_x_stride,
+                          const int16_t *filter_y, int32_t filter_y_stride,
+                          int32_t w, int32_t h) {
+  (void)filter_x;
+  (void)filter_y;
+  (void)filter_x_stride;
+  (void)filter_y_stride;
+
+  switch (w) {
+    case 4: {
+      avg_width4_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 8: {
+      avg_width8_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 16: {
+      avg_width16_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 32: {
+      avg_width32_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 64: {
+      avg_width64_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    default: {
+      avg_rows_c(src, (int32_t)src_stride, dst, (int32_t)dst_stride, w, h);
+      break;
+    }
+  }
+}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c
new file mode 100644
index 000000000..f7f116f4d
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include "aom_dsp/mips/macros_msa.h"
+
+/* Copies `rows` rows of `width` bytes from src to dst with memcpy().  Does
+ * nothing when rows <= 0 or width <= 0.  The vector kernels below use it for
+ * heights they have no batch size for, and the entry point uses it for
+ * widths without a vector kernel. */
+static void copy_rows_c(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+                        int32_t dst_stride, int32_t width, int32_t rows) {
+  int32_t row;
+
+  if (width <= 0) {
+    return;
+  }
+
+  for (row = 0; row < rows; ++row) {
+    memcpy(dst, src, (size_t)width);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+/* Copies an 8-pixel-wide column.
+ *
+ * The batch size is picked from the height: multiples of 12 use 8 + 4 rows
+ * per pass, multiples of 8 use 8 rows, multiples of 4 use 4 rows and other
+ * even heights use 2 rows; only the low 64 bits of each loaded vector are
+ * stored.  Odd heights, which used to copy nothing, go through the scalar
+ * helper. */
+static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+  if (0 == height % 12) {
+    for (cnt = (height / 12); cnt--;) {
+      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+      src += (8 * src_stride);
+
+      out0 = __msa_copy_u_d((v2i64)src0, 0);
+      out1 = __msa_copy_u_d((v2i64)src1, 0);
+      out2 = __msa_copy_u_d((v2i64)src2, 0);
+      out3 = __msa_copy_u_d((v2i64)src3, 0);
+      out4 = __msa_copy_u_d((v2i64)src4, 0);
+      out5 = __msa_copy_u_d((v2i64)src5, 0);
+      out6 = __msa_copy_u_d((v2i64)src6, 0);
+      out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+      SD4(out0, out1, out2, out3, dst, dst_stride);
+      dst += (4 * dst_stride);
+      SD4(out4, out5, out6, out7, dst, dst_stride);
+      dst += (4 * dst_stride);
+
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      src += (4 * src_stride);
+
+      out0 = __msa_copy_u_d((v2i64)src0, 0);
+      out1 = __msa_copy_u_d((v2i64)src1, 0);
+      out2 = __msa_copy_u_d((v2i64)src2, 0);
+      out3 = __msa_copy_u_d((v2i64)src3, 0);
+      SD4(out0, out1, out2, out3, dst, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else if (0 == height % 8) {
+    for (cnt = height >> 3; cnt--;) {
+      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+      src += (8 * src_stride);
+
+      out0 = __msa_copy_u_d((v2i64)src0, 0);
+      out1 = __msa_copy_u_d((v2i64)src1, 0);
+      out2 = __msa_copy_u_d((v2i64)src2, 0);
+      out3 = __msa_copy_u_d((v2i64)src3, 0);
+      out4 = __msa_copy_u_d((v2i64)src4, 0);
+      out5 = __msa_copy_u_d((v2i64)src5, 0);
+      out6 = __msa_copy_u_d((v2i64)src6, 0);
+      out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+      SD4(out0, out1, out2, out3, dst, dst_stride);
+      dst += (4 * dst_stride);
+      SD4(out4, out5, out6, out7, dst, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else if (0 == height % 4) {
+    for (cnt = (height / 4); cnt--;) {
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      src += (4 * src_stride);
+      out0 = __msa_copy_u_d((v2i64)src0, 0);
+      out1 = __msa_copy_u_d((v2i64)src1, 0);
+      out2 = __msa_copy_u_d((v2i64)src2, 0);
+      out3 = __msa_copy_u_d((v2i64)src3, 0);
+
+      SD4(out0, out1, out2, out3, dst, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else if (0 == height % 2) {
+    for (cnt = (height / 2); cnt--;) {
+      LD_UB2(src, src_stride, src0, src1);
+      src += (2 * src_stride);
+      out0 = __msa_copy_u_d((v2i64)src0, 0);
+      out1 = __msa_copy_u_d((v2i64)src1, 0);
+
+      SD(out0, dst);
+      dst += dst_stride;
+      SD(out1, dst);
+      dst += dst_stride;
+    }
+  } else {
+    copy_rows_c(src, src_stride, dst, dst_stride, 8, height);
+  }
+}
+
+/* Copies a block in 16-pixel column strips, 8 rows per pass.
+ *
+ * (width >> 4) strips are copied, each with (height >> 3) passes.  Rows
+ * left over when the height is not a multiple of 8 are copied by the scalar
+ * helper across the same strips; they used to be dropped. */
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width) {
+  int32_t cnt, loop_cnt;
+  const uint8_t *src_tmp;
+  uint8_t *dst_tmp;
+  const int32_t tail_rows = height % 8;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+  /* Finish the leftover rows first, while src/dst still point at the
+   * block origin (the strip loop below advances them). */
+  if (tail_rows > 0) {
+    const int32_t done_rows = height - tail_rows;
+
+    copy_rows_c(src + done_rows * src_stride, src_stride,
+                dst + done_rows * dst_stride, dst_stride, (width >> 4) * 16,
+                tail_rows);
+  }
+
+  for (cnt = (width >> 4); cnt--;) {
+    src_tmp = src;
+    dst_tmp = dst;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+      LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6,
+             src7);
+      src_tmp += (8 * src_stride);
+
+      ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp,
+             dst_stride);
+      dst_tmp += (8 * dst_stride);
+    }
+
+    /* Next 16-pixel column strip. */
+    src += 16;
+    dst += 16;
+  }
+}
+
+/* Copies a 16-pixel-wide column: multiples of 12 use 8 + 4 rows per pass,
+ * multiples of 8 use the strip kernel, multiples of 4 use 4 rows per pass.
+ * Any other height, which used to copy nothing, goes through the scalar
+ * helper. */
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+  if (0 == height % 12) {
+    for (cnt = (height / 12); cnt--;) {
+      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+      src += (8 * src_stride);
+      ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+      dst += (8 * dst_stride);
+
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      src += (4 * src_stride);
+      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else if (0 == height % 8) {
+    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+  } else if (0 == height % 4) {
+    for (cnt = (height >> 2); cnt--;) {
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      src += (4 * src_stride);
+
+      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else {
+    copy_rows_c(src, src_stride, dst, dst_stride, 16, height);
+  }
+}
+
+/* Copies a 32-pixel-wide column: multiples of 12 use three 4-row groups
+ * per pass, multiples of 8 use the strip kernel, multiples of 4 use 4 rows
+ * per pass (each row as two 16-byte halves).  Any other height, which used
+ * to copy nothing, goes through the scalar helper. */
+static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+  if (0 == height % 12) {
+    for (cnt = (height / 12); cnt--;) {
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+      src += (4 * src_stride);
+      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+      dst += (4 * dst_stride);
+
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+      src += (4 * src_stride);
+      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+      dst += (4 * dst_stride);
+
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+      src += (4 * src_stride);
+      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else if (0 == height % 8) {
+    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
+  } else if (0 == height % 4) {
+    for (cnt = (height >> 2); cnt--;) {
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+      src += (4 * src_stride);
+      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else {
+    copy_rows_c(src, src_stride, dst, dst_stride, 32, height);
+  }
+}
+
+/* Copies a 64-pixel-wide column as four 16-pixel strips. */
+static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
+}
+
+/* Copy "convolution" entry point (MSA): copies a w x h block from src to
+ * dst.  The filter arguments are ignored.  Width 4 is copied one 32-bit
+ * word per row, widths 8, 16, 32 and 64 use the vector kernels, and any
+ * other width is copied row by row with memcpy(). */
+void aom_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const int16_t *filter_x, int32_t filter_x_stride,
+                           const int16_t *filter_y, int32_t filter_y_stride,
+                           int32_t w, int32_t h) {
+  (void)filter_x;
+  (void)filter_y;
+  (void)filter_x_stride;
+  (void)filter_y_stride;
+
+  switch (w) {
+    case 4: {
+      uint32_t cnt, tmp;
+      /* 1 word storage */
+      for (cnt = h; cnt--;) {
+        tmp = LW(src);
+        SW(tmp, dst);
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    }
+    case 8: {
+      copy_width8_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 16: {
+      copy_width16_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 32: {
+      copy_width32_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 64: {
+      copy_width64_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    default: {
+      copy_rows_c(src, (int32_t)src_stride, dst, (int32_t)dst_stride, w, h);
+      break;
+    }
+  }
+}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
new file mode 100644
index 000000000..1a0ae4d8d
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ +#define AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ + +#include "aom_dsp/mips/macros_msa.h" +#include "aom_dsp/aom_filter.h" + +extern const uint8_t mc_filt_mask_arr[16 * 3]; +/* Statement-expression macro: signed byte dot products of vec0..vec3 with filt0..filt3, accumulated in pairs (0+1 and 2+3) and combined with __msa_adds_s_h; evaluates to a v8i16. */ +#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ + filt3) \ + ({ \ + v8i16 tmp_dpadd_0, tmp_dpadd_1; \ + \ + tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \ + tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ + tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \ + tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \ + \ + tmp_dpadd_0; \ + }) +/* Statement-expression macro: shuffles src0/src1 with the four masks, applies FILT_8TAP_DPADD_S_H, then rounds with a right shift by FILTER_BITS and saturates with __msa_sat_s_h(.., 7); evaluates to a v8i16. */ +#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \ + filt_h1, filt_h2, filt_h3) \ + ({ \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ + v8i16 hz_out_m; \ + \ + VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, vec0_m, vec1_m, vec2_m, \ + vec3_m); \ + hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, filt_h0, \ + filt_h1, filt_h2, filt_h3); \ + \ + hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \ + hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ + \ + hz_out_m; \ + }) +/* Block macro (not an expression): shuffles the pairs (src0, src1) and (src2, src3) with each mask, accumulates the four filter taps and writes the ADDS_SH2_SH sums to out0/out1; no rounding shift is applied here. */ +#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, mask3, filt0, filt1, filt2, filt3, \ + out0, out1) \ + { \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ + DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ + DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ +
DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \ + ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \ + } +/* Block macro: same tap accumulation as HORIZ_8TAP_4WID_4VECS_FILT, but each of src0..src3 is shuffled with itself and four sums are written to out0..out3. */ +#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, mask3, filt0, filt1, filt2, filt3, \ + out0, out1, out2, out3) \ + { \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \ + res4_m, res5_m, res6_m, res7_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \ + res4_m, res5_m, res6_m, res7_m); \ + ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \ + res7_m, out0, out1, out2, out3); \ + } +/* Block macro: packs in1/in0 through PCKEV_XORI128_UB, averages the result with dst (__msa_aver_u_b) and stores the v16u8 vector at pdst. */ +#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) \ + { \ + v16u8 tmp_m; \ + \ + tmp_m = PCKEV_XORI128_UB(in1, in0); \ + tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ + ST_UB(tmp_m, (pdst)); \ + } +/* Block macro: packs in0/in1 with __msa_pckev_b, averages the result with dst (__msa_aver_u_b) and stores the v16u8 vector at pdst. */ +#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) \ + { \ + v16u8 tmp_m; \ + \ + tmp_m = 
(v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ + ST_UB(tmp_m, (pdst)); \ + } +/* Block macro: packs in1..in4 into two byte vectors, packs dst0..dst3 into two more, averages the pairs and stores the result through ST8x4_UB at pdst with the given stride. */ +#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \ + stride) \ + { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ + PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ + AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ + } +#endif /* AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ */ diff --git a/third_party/aom/aom_dsp/mips/avg_msa.c b/third_party/aom/aom_dsp/mips/avg_msa.c new file mode 100644 index 000000000..0e1728155 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/avg_msa.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/macros_msa.h" +/* Rounded mean of a block read as 8 rows through LD_UB8: the byte lanes are reduced with horizontal adds, and lane 0 of the final vector, after a rounding right shift by 6 (divide by 64), is returned. */ +uint32_t aom_avg_8x8_msa(const uint8_t *src, int32_t src_stride) { + uint32_t sum_out; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; + v4u32 sum = { 0 }; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3); + HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7); + ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6); + ADD2(sum0, sum2, sum4, sum6, sum0, sum4); + sum0 += sum4; + + sum = __msa_hadd_u_w(sum0, sum0); + sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum); + sum = __msa_hadd_u_w(sum0, sum0); + sum = (v4u32)__msa_srari_w((v4i32)sum, 6); + sum_out = __msa_copy_u_w((v4i32)sum, 0); + + return sum_out; +} +/* Rounded mean of a 4x4 block: four 32-bit row words (LW4) are inserted into one vector, reduced with horizontal adds, and lane 0 after a rounding right shift by 4 (divide by 16) is returned. */ +uint32_t aom_avg_4x4_msa(const uint8_t *src, int32_t src_stride) { + uint32_t sum_out; + uint32_t src0, src1, src2, src3; + v16u8 vec = { 0 }; + v8u16 sum0; + v4u32 sum1; + v2u64 sum2; + + LW4(src, src_stride, src0, src1, src2, src3); + INSERT_W4_UB(src0, src1, src2, src3, vec); + + sum0 = __msa_hadd_u_h(vec, vec); + sum1 = __msa_hadd_u_w(sum0, sum0); + sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1); + sum1 = __msa_hadd_u_w(sum0, sum0); + sum2 = __msa_hadd_u_d(sum1, sum1); + sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4); + sum_out = __msa_copy_u_w((v4i32)sum1, 0); + + return sum_out; +} diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.c b/third_party/aom/aom_dsp/mips/common_dspr2.c new file mode 100644 index 000000000..00ab75dc3 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/common_dspr2.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/common_dspr2.h" + +#if HAVE_DSPR2 +uint8_t aom_ff_cropTbl_a[256 + 2 * CROP_WIDTH]; +uint8_t *aom_ff_cropTbl; +/* Fills the clamp lookup table: CROP_WIDTH zeros, then the identity 0..255, then CROP_WIDTH entries of 255. aom_ff_cropTbl is pointed at the identity part, so indices from -CROP_WIDTH to 255 + CROP_WIDTH map to a value clamped to [0, 255]. */ +void aom_dsputil_static_init(void) { + int i; + + for (i = 0; i < 256; i++) aom_ff_cropTbl_a[i + CROP_WIDTH] = i; + + for (i = 0; i < CROP_WIDTH; i++) { + aom_ff_cropTbl_a[i] = 0; + aom_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255; + } + + aom_ff_cropTbl = &aom_ff_cropTbl_a[CROP_WIDTH]; +} + +#endif diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.h b/third_party/aom/aom_dsp/mips/common_dspr2.h new file mode 100644 index 000000000..31159fdcd --- /dev/null +++ b/third_party/aom/aom_dsp/mips/common_dspr2.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_COMMON_MIPS_DSPR2_H_ +#define AOM_COMMON_MIPS_DSPR2_H_ +/* NOTE(review): the system header name was blank in this copy of the patch; <assert.h> is presumed - confirm against upstream. */ +#include <assert.h> +#include "./aom_config.h" +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif +#if HAVE_DSPR2 +#define CROP_WIDTH 512 + +extern uint8_t *aom_ff_cropTbl; // Defined in "aom_dsp/mips/common_dspr2.c" +/* prefetch data for load (pref hint 0) */ +static INLINE void prefetch_load(const unsigned char *src) { + __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src)); +} + +/* prefetch data for store (pref hint 1) */ +static INLINE void prefetch_store(unsigned char *dst) { + __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst)); +} +/* prefetch data for load, streamed variant (pref hint 4) */ +static INLINE void prefetch_load_streamed(const unsigned char *src) { + __asm__ __volatile__("pref 4, 0(%[src]) \n\t" : : [src] "r"(src)); +} + +/* prefetch data for store, streamed variant (pref hint 5) */ +static INLINE void prefetch_store_streamed(unsigned char *dst) { + __asm__ __volatile__("pref 5, 0(%[dst]) \n\t" : : [dst] "r"(dst)); +} +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_MIPS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c new file mode 100644 index 000000000..d557115b9 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <stdio.h> +/* NOTE(review): the two system header names above were blank in this copy of the patch; <assert.h> is needed for the assert() call in this file, <stdio.h> is presumed - confirm against upstream. */ +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" +/* convolve_bi_avg_vert_4_dspr2: two-tap vertical filter averaged into dst, 4 columns per asm block. Taps 3 and 4 of filter_y are read as one 32-bit word (filter45) and applied to the current row and the row below; results are looked up through aom_ff_cropTbl and averaged with the existing dst bytes. x advances by 4, so w is expected to be a multiple of 4. NOTE(review): filter45 is loaded through an int32_t pointer cast of an int16_t address. */ +#if HAVE_DSPR2 +static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "lbu 
%[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} +/* Same kernel as convolve_bi_avg_vert_4_dspr2 with the width fixed at 64 and a second prefetch_store for the bytes 32 past the start of the next dst row. */ +static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 
*/ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} +/* Entry point for the two-tap vertical averaging filter: requires y_step_q4 == 16 (asserted), writes pos = 38 with wrdsp for the extp extractions done in the kernels, then dispatches on w. Widths 4/8/16/32 share the 4-column-step kernel, 64 has its own kernel, and any other width goes to aom_convolve8_avg_vert_c; filter_x and x_step_q4 are only used by that fallback. */ +void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + uint32_t pos = 38; + + assert(y_step_q4 == 16); + + /* bit position for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_bi_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, + w, h); + break; + case 64: + prefetch_store(dst + 32); + convolve_bi_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, + h); + break; + default: + aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c new file mode 100644 index 000000000..efbdcf60f --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c @@ -0,0 +1,802 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3; + uint32_t tn1, tn2; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p3], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ + + /* odd 2. 
pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ + "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ + "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ + + /* clamp */ + "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ + "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */ + "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ + + "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ + "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ + + "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */ + "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3, tp4; + uint32_t p1, p2, p3, p4, n1; + uint32_t st0, st1; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. 
pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + "lbu %[Temp2], 0(%[dst]) \n\t" + "lbu %[tp4], 2(%[dst]) \n\t" + + /* even 2. pixel */ + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + "addqh_r.w %[tp4], %[tp4], %[st1] \n\t" + "sb %[Temp2], 0(%[dst]) \n\t" + "sb %[tp4], 2(%[dst]) \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "balign %[tp3], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "lbu %[Temp2], 4(%[dst]) \n\t" + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[Temp2], 4(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tp1], 6(%[dst]) \n\t" + + /* odd 2. 
pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + "lbu %[tp2], 1(%[dst]) \n\t" + "lbu %[tp3], 3(%[dst]) \n\t" + "addqh_r.w %[tp1], %[tp1], %[st0] \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" + "addqh_r.w %[tp2], %[tp2], %[st1] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tp4], 5(%[dst]) \n\t" + + /* odd 4. pixel */ + "sb %[tp2], 1(%[dst]) \n\t" + "sb %[tp1], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbu %[tp1], 7(%[dst]) \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "addqh_r.w %[tp3], %[tp3], %[p4] \n\t" + + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[tp4], %[tp4], %[p2] \n\t" + + "lbux %[p1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[tp1], %[tp1], %[p1] \n\t" + + /* store bytes */ + "sb %[tp3], 3(%[dst]) \n\t" + "sb %[tp4], 5(%[dst]) \n\t" + "sb %[tp1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... 
*/ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { /* 2-tap horizontal filter over 64 pixels per row; each result is averaged with the byte already in dst */ + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; /* filter_x0[3] and filter_x0[4] read as one packed 32-bit word */ + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { /* four 16-pixel groups per row */ + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 2 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 2 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3.
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 2 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7.
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... 
*/ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void aom_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { /* selects the averaging horizontal kernel that matches the block width w */ + uint32_t pos = 38; + + assert(x_step_q4 == 16); /* the width-specific kernels below take no step argument */ + + /* bit position for extract from acc (written with wrdsp; the kernels' extp instructions depend on it) */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + case 8: + convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + case 16: + convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 1); + break; + case 32: + convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 2); + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + default: /* no width-specific kernel: defer to aom_convolve8_avg_horiz_c with the full argument list */ + aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c new file mode 100644 index 000000000..066308315 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c @@ -0,0 +1,1030 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software.
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_bi_horiz_4_transposed_dspr2( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { /* 2-tap horizontal filter of 4 pixels per source row; the 4 results are written down a dst column (transposed) */ + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint8_t *dst_ptr; + int32_t Temp1, Temp2; + uint32_t vector4a = 64; /* value loaded into each accumulator (mtlo) before dpa.w.ph */ + uint32_t tp1, tp2; + uint32_t p1, p2; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; /* filter_x0[3] and filter_x0[4] read as one packed 32-bit word */ + + for (y = h; y--;) { + dst_ptr = dst; + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp2], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* odd 2.
pixel */ + "lbux %[tp2], %[Temp2](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp2], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p1], %[Temp1](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + + /* store bytes in pixel order (even 1, odd 1, even 2, odd 2), each one dst_stride further down */ + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[p1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[tp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[p2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [src] "r"(src), [dst_stride] "r"(dst_stride)); + + /* Next row... the following source row is written one byte to the right in dst */ + src += src_stride; + dst += 1; + } +} + +static void convolve_bi_horiz_8_transposed_dspr2( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { /* 8-pixel variant of the transposed 2-tap horizontal filter */ + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint8_t *dst_ptr; + uint32_t vector4a = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4; + uint8_t *odd_dst; + uint32_t dst_pitch_2 = (dst_stride << 1); /* two dst rows */ + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; /* filter_x0[3] and filter_x0[4] read as one packed 32-bit word */ + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + + dst_ptr = dst; + odd_dst = (dst_ptr + dst_stride); /* even results are stored through dst_ptr, odd results through odd_dst; both step by dst_pitch_2 */ + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1.
pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "balign %[tp3], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "extp %[p3], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[Temp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[Temp1], %[p3](%[cm]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[Temp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. 
pixel */ + "lbux %[tp3], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[tp3], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[p1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p2], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p1], 0(%[odd_dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr), + [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); + + /* Next row... 
*/ + src += src_stride; + dst += 1; + } +} + +static void convolve_bi_horiz_16_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { /* transposed 2-tap horizontal filter; count is the number of 16-pixel groups filtered per source row */ + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); /* two dst rows */ + uint8_t *odd_dst; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; /* filter_x0[3] and filter_x0[4] read as one packed 32-bit word */ + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); /* even results are stored through dst, odd results through odd_dst */ + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "ulw %[qload1], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload1] " + "\n\t" + "ulw %[qload2], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] " + "\n\t" /* even 2 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 2 */ + + /* even 3.
pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 2 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter45] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter45] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter45] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7.
pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. 
pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter45] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter45] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. 
pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... 
*/ + src_ptr += src_stride; + dst_ptr += 1; + } +} + +static void convolve_bi_horiz_64_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "ulw %[qload1], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload1] " + "\n\t" + "ulw %[qload2], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ + + /* even 3. 
pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter45] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter45] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter45] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7. 
pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. 
pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter45] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter45] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. 
pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... 
*/ + src_ptr += src_stride; + dst_ptr += 1; + } +} + +/* Plain C 2-tap horizontal filter with transposed output. For each of the h + * source rows and each x in [0, w) it computes + * src[x] * filter[3] + src[x + 1] * filter[4], rounds the sum with + * ROUND_POWER_OF_TWO(sum, FILTER_BITS), clips it with clip_pixel() and stores + * it at dst[x * dst_stride]. dst advances by one byte per source row, so + * source rows become destination columns. Reads src[0..w] on every row. + */ +void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter, int w, int h) { + int x, y; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + int sum = 0; + + sum += src[x] * filter[3]; + sum += src[x + 1] * filter[4]; + + dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + + src += src_stride; + dst += 1; + } +} + +/* Dispatches the transposed 2-tap horizontal filter on the block width w. + * Widths 4, 8, 16, 32 and 64 go to the DSPr2 assembly kernels (16 and 32 share + * one kernel that is passed w / 16); every other width uses + * convolve_bi_horiz_transposed(). Before dispatching, 38 is written to the + * DSP control register with wrdsp as the bit position for extraction from + * the accumulators. + */ +void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter, int w, + int h) { + uint32_t pos = 38; + + /* bit position for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + + switch (w) { + case 4: + convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride, + filter, h); + break; + case 8: + convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride, + filter, h); + break; + case 16: + case 32: + convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride, + filter, h, (w / 16)); + break; + case 64: + prefetch_load(src + 32); /* NOTE(review): same address as the prefetch issued before the switch */ + convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride, + filter, h); + break; + default: + convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w, + h); + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c new file mode 100644 index 000000000..dc51ab1cb --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c @@ -0,0 +1,681 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +/* Horizontal 2-tap (bilinear) filter for rows of 4 pixels, written with MIPS + * DSPr2 inline assembly. Only taps filter_x0[3] and filter_x0[4] are read; + * they are fetched together into filter45. For each of the h rows it loads + * src[0..7], writes dst[0..3], then steps src and dst by their strides. Each + * accumulator is preloaded with 64 and the extracted value is used as an + * index into aom_ff_cropTbl. The extp bit position is not set here; the + * dispatcher aom_convolve2_horiz_dspr2() writes it with wrdsp. + * + * NOTE(review): the asm stores through dst and overwrites $ac2/$ac3, yet its + * operand lists end after the inputs (no "memory" or accumulator clobbers), + * and filter45 is read through an int32_t pointer cast of an int16_t + * pointer - confirm both are acceptable for the DSPr2 toolchains. + */ +static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. 
pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p1], %[Temp2](%[cm]) \n\t" + "lbux %[p2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst]) \n\t" + "sb %[p1], 1(%[dst]) \n\t" + "sb %[tp2], 2(%[dst]) \n\t" + "sb %[p2], 3(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +/* Horizontal 2-tap (bilinear) filter for rows of 8 pixels, written with MIPS + * DSPr2 inline assembly. Only taps filter_x0[3] and filter_x0[4] are read; + * they are fetched together into filter45. For each of the h rows it loads + * src[0..11], writes dst[0..7], then steps src and dst by their strides. + * Each accumulator is preloaded with 64 and the extracted value is used as + * an index into aom_ff_cropTbl. The extp bit position is not set here; the + * dispatcher aom_convolve2_horiz_dspr2() writes it with wrdsp. + * + * NOTE(review): the asm stores through dst and overwrites $ac1-$ac3, yet its + * operand lists end after the inputs (no "memory" or accumulator clobbers), + * and filter45 is read through an int32_t pointer cast of an int16_t + * pointer - confirm both are acceptable for the DSPr2 toolchains. + */ +static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4; + uint32_t st0, st1; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. 
pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[st0], 0(%[dst]) \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + + "balign %[tp3], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[st1], 2(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[st0], 4(%[dst]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. 
pixel */ + "sb %[st1], 1(%[dst]) \n\t" + "sb %[st0], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[p1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 3(%[dst]) \n\t" + "sb %[p2], 5(%[dst]) \n\t" + "sb %[p1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +/* Horizontal 2-tap (bilinear) filter for rows made of count groups of 16 + * pixels, written with MIPS DSPr2 inline assembly. Only taps filter_x0[3] and + * filter_x0[4] are read; they are fetched together into filter45. For each + * of the h rows and each group it loads src[0..16], writes dst[0..15] and + * advances src and dst by 16; rows step by src_stride / dst_stride. Each + * accumulator is preloaded with 64 and the extracted value is used as an + * index into aom_ff_cropTbl. The extp bit position is not set here; the + * dispatcher aom_convolve2_horiz_dspr2() writes it with wrdsp. + * + * NOTE(review): the asm stores through dst and overwrites $ac1-$ac3, yet its + * operand lists end after the inputs (no "memory" or accumulator clobbers), + * and filter45 is read through an int32_t pointer cast of an int16_t + * pointer - confirm both are acceptable for the DSPr2 toolchains. + */ +static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. 
pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +/* Horizontal 2-tap (bilinear) filter for 64-pixel-wide rows, written with + * MIPS DSPr2 inline assembly. Only taps filter_x0[3] and filter_x0[4] are + * read; they are fetched together into filter45. Each of the h rows is + * processed as four groups of 16 pixels: per group it loads src[0..16], + * writes dst[0..15] and advances src and dst by 16; rows step by + * src_stride / dst_stride. Each accumulator is preloaded with 64 and the + * extracted value is used as an index into aom_ff_cropTbl. The extp bit + * position is not set here; the dispatcher aom_convolve2_horiz_dspr2() + * writes it with wrdsp. + * + * NOTE(review): the asm stores through dst and overwrites $ac1-$ac3, yet its + * operand lists end after the inputs (no "memory" or accumulator clobbers), + * and filter45 is read through an int32_t pointer cast of an int16_t + * pointer - confirm both are acceptable for the DSPr2 toolchains. + */ +static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. 
pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +/* Entry point for the horizontal 2-tap convolution. Asserts that + * x_step_q4 == 16, writes 38 to the DSP control register with wrdsp (the bit + * position for extraction from the accumulators), then dispatches on the + * block width w: 4, 8, 16, 32 and 64 use the DSPr2 kernels in this file with + * filter_x (16 and 32 share one kernel, passed 1 or 2 groups); any other + * width falls back to aom_convolve8_horiz_c() with all arguments forwarded. + * filter_y and y_step_q4 are only used by that fallback. + */ +void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + uint32_t pos = 38; + + assert(x_step_q4 == 16); + + prefetch_load((const uint8_t *)filter_x); + + /* bit position for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 8: + convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 16: + convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 1); + break; + case 32: + convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 2); + break; + case 64: + prefetch_load(src + 64); + 
prefetch_store(dst + 32); + + convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + default: + aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c new file mode 100644 index 000000000..3367be01a --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +/* Vertical 2-tap (bilinear) filter written with MIPS DSPr2 inline assembly. + * Only taps filter_y[3] and filter_y[4] are read; they are fetched together + * into filter45. For each of the h rows, columns are processed four at a + * time while x < w: four bytes are loaded at src + x and four more one + * src_stride below, the two rows are paired per pixel and multiplied + * against filter45, and four bytes are written at dst + x. Each accumulator + * is preloaded with 64 and the extracted value is used as an index into + * aom_ff_cropTbl. The extp bit position is not set in this function. + * + * NOTE(review): the asm stores through dst_ptr and overwrites $ac0-$ac3 + * ($ac0 included), yet its operand lists end after the inputs (no "memory" + * or accumulator clobbers), and filter45 is read through an int32_t pointer + * cast of an int16_t pointer - confirm both are acceptable for the DSPr2 + * toolchains. + */ +static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "extp 
%[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], 
%[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} + +/* Entry point for the 2-tap vertical convolution. Asserts y_step_q4 == 16, writes pos (38) with wrdsp so the extp instructions in the helpers extract from that bit position, then dispatches on w: 4, 8, 16 and 32 go to convolve_bi_vert_4_dspr2, 64 goes to convolve_bi_vert_64_dspr2, and any other width falls back to aom_convolve8_vert_c. filter_x and x_step_q4 are only forwarded to that C fallback. */ void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + uint32_t pos = 38; + + assert(y_step_q4 == 16); + + /* bit position for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: /* these widths share the helper that steps 4 pixels at a time; the ptrdiff_t strides are narrowed to its int32_t parameters */ + convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, + h); + break; + case 64: + prefetch_store(dst + 32); + convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); + break; + default: /* widths without a DSPr2 helper use the generic C routine */ + aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c new file mode 100644 index 000000000..298065adb --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c @@ -0,0 +1,641 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include /* NOTE(review): the two directives above carry no header name in this copy (the angle-bracket part looks stripped); restore them from upstream before building */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +/* 8-tap vertical filter that averages its result into dst, four pixels per inner iteration. The eight taps are read from filter_y as four 32-bit words (two taps each), src is moved up three rows so eight consecutive rows are loaded per output row, every accumulator starts at vector4a, and each extracted sum is mapped through aom_ff_cropTbl and then combined with the byte already in dst by addqh_r.w before being stored. The inner loop advances x by 4, so w is presumably a multiple of 4. */ static void convolve_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + /* taps 0-1, 2-3, 4-5 and 6-7 as packed halfword pairs; assumes filter_y is at least 4-byte aligned -- TODO confirm */ vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; /* start three rows above the output row */ + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* 
pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 
*/ + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} + +/* 8-tap vertical filter with averaging into dst, row width fixed at 64 pixels: same per-pixel sequence as convolve_avg_vert_4_dspr2, with the inner loop running x from 0 to 60 in steps of 4 and a second prefetch_store covering the upper half of the next dst row */ static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + /* taps 0-1, 2-3, 4-5 and 6-7 as packed halfword pairs; assumes filter_y is at least 4-byte aligned -- TODO confirm */ vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; /* start three rows above the output row */ + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], 
%[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + 
"precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} + +/* Entry point for the 8-tap vertical convolution that averages into dst. Asserts y_step_q4 == 16 and that the second 32-bit word of filter_y is not 0x800000. When the first 32-bit word of filter_y is zero (taps 0 and 1 both zero) the work is handed to aom_convolve2_avg_vert_dspr2; otherwise pos (38) is written with wrdsp and the call is dispatched on w: 4, 8, 16 and 32 go to convolve_avg_vert_4_dspr2, 64 goes to convolve_avg_vert_64_dspr2, any other width falls back to aom_convolve8_avg_vert_c. */ void aom_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); /* NOTE(review): presumably rules out a pass-through kernel whose only non-zero tap is 128 -- confirm against the callers */ + + if (((const int32_t *)filter_y)[0] == 0) { + aom_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + /* bit position for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, + h); + break; + case 64: + prefetch_store(dst + 32); + convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, + h); + break; + default: + aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} + +/* 2-D 8-tap convolution that averages into dst: a horizontal pass (aom_convolve8_horiz) writes into a 64-byte-wide temporary buffer starting three source rows above the block, then a vertical averaging pass (aom_convolve8_avg_vert) reads that buffer from its fourth row and writes dst. Asserts w <= 64, h <= 64 and both step values == 16. */ void aom_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + /* Fixed size intermediate buffer places limits on parameters. 
+ */ + DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); /* rows are 64 bytes apart (see the stride passed below) */ + int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; /* h + 7 rows once y_step_q4 == 16: the vertical pass starts at row 3 of temp and needs rows above and below */ + + assert(w <= 64); + assert(h <= 64); + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + + /* with y_step_q4 == 16 the value above is already h + 7, so this clamp does not change it */ if (intermediate_height < h) intermediate_height = h; + + aom_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x, + x_step_q4, filter_y, y_step_q4, w, intermediate_height); + + aom_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); +} + +/* Averages src into dst without filtering: every dst byte becomes the rounded mean of itself and the matching src byte. Widths 4, 8, 16, 32 and 64 use unrolled inline assembly that loads 32-bit words with ulw, averages them bytewise with adduh_r.qb and stores with sw; any other width uses the plain C loop in the default case. filter_x, filter_x_stride, filter_y and filter_y_stride are not used. NOTE(review): the assembly stores through dst with the aligned sw instruction and lists no memory clobber -- confirm that dst rows are word aligned for these widths and that the compiler cannot cache dst contents across the asm statements. */ void aom_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, int w, + int h) { + int x, y; + uint32_t tp1, tp2, tn1; + uint32_t tp3, tp4, tn2; + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + /* 1 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + + : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + case 8: + /* 2 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), 
[tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + case 16: + /* 4 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 8(%[src]) \n\t" + "ulw %[tp2], 8(%[dst]) \n\t" + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "ulw %[tp3], 12(%[src]) \n\t" + "ulw %[tp4], 12(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + case 32: + /* 8 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 8(%[src]) \n\t" + "ulw %[tp2], 8(%[dst]) \n\t" + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "ulw %[tp3], 12(%[src]) \n\t" + "ulw %[tp4], 12(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 16(%[src]) \n\t" + "ulw %[tp2], 16(%[dst]) \n\t" + "sw %[tn1], 
8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ + "ulw %[tp3], 20(%[src]) \n\t" + "ulw %[tp4], 20(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 24(%[src]) \n\t" + "ulw %[tp2], 24(%[dst]) \n\t" + "sw %[tn1], 16(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 20(%[dst]) \n\t" /* store */ + "ulw %[tp3], 28(%[src]) \n\t" + "ulw %[tp4], 28(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 24(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 28(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + /* 16 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_load(src + src_stride + 64); + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 8(%[src]) \n\t" + "ulw %[tp2], 8(%[dst]) \n\t" + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "ulw %[tp3], 12(%[src]) \n\t" + "ulw %[tp4], 12(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 16(%[src]) \n\t" + "ulw %[tp2], 16(%[dst]) \n\t" + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ + "ulw %[tp3], 20(%[src]) 
\n\t" + "ulw %[tp4], 20(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 24(%[src]) \n\t" + "ulw %[tp2], 24(%[dst]) \n\t" + "sw %[tn1], 16(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 20(%[dst]) \n\t" /* store */ + "ulw %[tp3], 28(%[src]) \n\t" + "ulw %[tp4], 28(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 32(%[src]) \n\t" + "ulw %[tp2], 32(%[dst]) \n\t" + "sw %[tn1], 24(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 28(%[dst]) \n\t" /* store */ + "ulw %[tp3], 36(%[src]) \n\t" + "ulw %[tp4], 36(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 40(%[src]) \n\t" + "ulw %[tp2], 40(%[dst]) \n\t" + "sw %[tn1], 32(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 36(%[dst]) \n\t" /* store */ + "ulw %[tp3], 44(%[src]) \n\t" + "ulw %[tp4], 44(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 48(%[src]) \n\t" + "ulw %[tp2], 48(%[dst]) \n\t" + "sw %[tn1], 40(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 44(%[dst]) \n\t" /* store */ + "ulw %[tp3], 52(%[src]) \n\t" + "ulw %[tp4], 52(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 56(%[src]) \n\t" + "ulw %[tp2], 56(%[dst]) \n\t" + "sw %[tn1], 48(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 52(%[dst]) \n\t" /* store */ + "ulw %[tp3], 60(%[src]) \n\t" + "ulw %[tp4], 60(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 56(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 60(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : 
[src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + default: + for (y = h; y > 0; --y) { + for (x = 0; x < w; ++x) { + dst[x] = (dst[x] + src[x] + 1) >> 1; + } + + src += src_stride; + dst += dst_stride; + } + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c new file mode 100644 index 000000000..f6534b420 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c @@ -0,0 +1,998 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include /* NOTE(review): the two directives above carry no header name in this copy (the angle-bracket part looks stripped); restore them from upstream before building */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +/* 8-tap horizontal filter that averages its result into dst, for rows of four output pixels. Per row it loads source words at byte offsets 0, 4 and 8 from src, computes two even and two odd output pixels in accumulators that start at vector4a, maps each extracted sum through aom_ff_cropTbl, combines it with the byte already in dst using addqh_r.w and stores the four bytes. The eight taps are read from filter_x0 as four 32-bit words. */ static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t n1, n2, n3, n4; + uint32_t tn1, tn2; + + /* taps 0-1, 2-3, 4-5 and 6-7 as packed halfword pairs; assumes filter_x0 is at least 4-byte aligned -- TODO confirm */ vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. 
pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "preceu.ph.qbl %[n4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ + + /* odd 2. 
pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[n1], %[tn1] \n\t" + "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ + "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ + "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ + + /* clamp */ + "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ + "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */ + "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ + + "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ + "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ + + "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */ + "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), + [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4, n1; + uint32_t tn1, tn2, tn3; + uint32_t st0, st1; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + "lbu %[Temp2], 0(%[dst]) \n\t" + "lbu %[tn3], 2(%[dst]) \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "preceu.ph.qbl %[n1], %[tn2] \n\t" + "ulw %[tn1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. 
pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tn1] \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + "addqh_r.w %[tn3], %[tn3], %[st1] \n\t" + "sb %[Temp2], 0(%[dst]) \n\t" + "sb %[tn3], 2(%[dst]) \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "balign %[tn3], %[tn1], 3 \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "lbu %[Temp2], 4(%[dst]) \n\t" + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[Temp2], 4(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tp1], 6(%[dst]) \n\t" + + /* odd 2. 
pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "preceu.ph.qbl %[n1], %[tn1] \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + "lbu %[tp2], 1(%[dst]) \n\t" + "lbu %[tn2], 3(%[dst]) \n\t" + "addqh_r.w %[tp1], %[tp1], %[st0] \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[tn3] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "addqh_r.w %[tp2], %[tp2], %[st1] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tn3], 5(%[dst]) \n\t" + + /* odd 4. pixel */ + "sb %[tp2], 1(%[dst]) \n\t" + "sb %[tp1], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbu %[tn1], 7(%[dst]) \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "addqh_r.w %[tn2], %[tn2], %[p4] \n\t" + + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[tn3], %[tn3], %[p2] \n\t" + + "lbux %[n1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[tn1], %[tn1], %[n1] \n\t" + + /* store bytes */ + "sb %[tn2], 3(%[dst]) \n\t" + "sb %[tn3], 5(%[dst]) \n\t" + "sb %[tn1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0), + [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] 
"r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "ulw %[qload2], 16(%[src]) \n\t" + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); 
+ + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "ulw %[qload2], 16(%[src]) \n\t" + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); 
+ + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +/* Entry point that picks a horizontal filter-and-average DSPr2 kernel by block width w; filter_y and y_step_q4 are only forwarded to aom_convolve2_avg_horiz_dspr2 and aom_convolve8_avg_horiz_c. */ void aom_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); /* NOTE(review): looks like this excludes a filter whose taps 2,3 are (0, 128) on a little-endian target, i.e. the plain copy filter - confirm */ + + if (((const int32_t *)filter_x)[0] == 0) { /* first 32-bit word of filter_x, i.e. taps 0 and 1, is zero: hand off to the convolve2 variant */ + aom_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + src -= 3; /* the width-specific kernels read their 8-tap window starting at src[0]; the default case undoes this with src + 3 */ + + /* bit position for extract from acc (written once here, before any kernel runs its extp instructions) */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + case 8: + convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + case 16: /* count = 1: one 16-pixel step per row */ + convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 1); + break; + case 32: /* count = 2: the 16-pixel kernel is stepped twice per row */ + convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 2); + break; + case 64: /* extra prefetches for the wider row before the 64-pixel kernel */ + prefetch_load(src + 64); + prefetch_store(dst + 32); + + convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + default: /* any other width goes to the C implementation, with the original (un-rewound) src */ + aom_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, w, + h); + break; + } + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c new file mode 100644 index 000000000..c871702f4 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c @@ -0,0 +1,1590 @@ +/* + * Copyright (c) 2016, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint8_t *dst_ptr; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t tn1, tn2; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + dst_ptr = dst; + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. 
pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* clamp even 1 (interleaved with the odd 1 setup) */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2.
pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[p2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[tn1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[tp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[p2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [dst_ptr] "+r"(dst_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src), + [dst_stride] "r"(dst_stride)); + + /* Next row... 
*/ + src += src_stride; + dst += 1; + } +} + +static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint8_t *dst_ptr; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4, n1; + uint8_t *odd_dst; + uint32_t dst_pitch_2 = (dst_stride << 1); + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + + dst_ptr = dst; + odd_dst = (dst_ptr + dst_stride); + + __asm__ __volatile__( + "ulw %[tp2], 0(%[src]) \n\t" + "ulw %[tp1], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp1] \n\t" + "preceu.ph.qbl %[p4], %[tp1] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tp3] \n\t" + "preceu.ph.qbl %[n1], %[tp3] \n\t" + "ulw %[tp2], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. 
pixel */ + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" /* clamp even 1 */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tp2] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "lbux %[tp3], %[Temp3](%[cm]) \n\t" /* clamp even 2 */ + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[p3], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[Temp2], 0(%[dst_ptr]) \n\t" /* store even 1 */ + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "sb %[tp3], 0(%[dst_ptr]) \n\t" /* store even 2 */ + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "ulw %[tp1], 1(%[src]) \n\t" + "ulw %[tp3], 5(%[src]) \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[tp2], %[p3](%[cm]) \n\t" /* clamp even 3 */ + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[tp2], 0(%[dst_ptr]) \n\t" /* store even 3 */ + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "ulw %[tp2], 9(%[src]) \n\t" + + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2.
pixel */ + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[n1], %[tp2] \n\t" + "ulw %[Temp1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[tp3], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[Temp1] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[tp3], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[n1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p2], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[n1], 0(%[odd_dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst_ptr] "+r"(dst_ptr), [odd_dst] "+r"(odd_dst) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src), + [dst_pitch_2] 
"r"(dst_pitch_2)); + + /* Next row... */ + src += src_stride; + dst += 1; + } +} + +static void convolve_horiz_16_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "ulw %[qload2], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. 
pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "ulw %[qload1], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] " + "\n\t" /* even 2 */ + "dpa.w.ph $ac2, %[p3], %[filter34] " + "\n\t" /* even 2 */ + "dpa.w.ph $ac2, %[p4], %[filter56] " + "\n\t" /* even 2 */ + "dpa.w.ph $ac2, %[p1], %[filter78] " + "\n\t" /* even 2 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 2 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter12] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 2 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 16(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5.
pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter12] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. 
pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. 
pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 17(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. 
pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter12] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. 
pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src), + [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... 
*/ + src_ptr += src_stride; + + dst_ptr += 1; + } +} + +static void convolve_horiz_64_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "ulw %[qload2], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. 
pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "ulw %[qload1], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] " + "\n\t" /* even 2 */ + "dpa.w.ph $ac2, %[p3], %[filter34] " + "\n\t" /* even 2 */ + "dpa.w.ph $ac2, %[p4], %[filter56] " + "\n\t" /* even 2 */ + "dpa.w.ph $ac2, %[p1], %[filter78] " + "\n\t" /* even 2 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 2 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter12] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 2 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 16(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5.
pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter12] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. 
pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. 
pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 17(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. 
pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter12] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. 
pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src), + [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... 
*/ + src_ptr += src_stride; + + dst_ptr += 1; + } +} +/* Plain-C 8-tap horizontal filter with transposed output. For each of the h rows, output x (0 <= x < w) is the sum of src[x + k] * filter[k] for k = 0..7, rounded with ROUND_POWER_OF_TWO(sum, FILTER_BITS) and passed through clip_pixel(). Output x of row y is stored at dst[x * dst_stride + y], so input rows become output columns. src must already point at the first tap: src[0] .. src[w + 6] are read per row. */ +void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter, int w, int h) { + int x, y, k; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + int sum = 0; + + for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k]; + + dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + + src += src_stride; + dst += 1; + } +} +/* Transposed copy: for each of the h rows, src[x] (0 <= x < w) is written to dst[x * dst_stride + y], so input rows become output columns. */ +void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { + int x, y; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + dst[x * dst_stride] = src[x]; + } + + src += src_stride; + dst += 1; + } +} +/* Two-pass 8-tap convolution through the transposed scratch buffer temp. Pass 1 runs filter_x over intermediate_height rows of src, starting 3 rows above the block, and writes them transposed into temp (stride intermediate_height). Pass 2 runs filter_y over temp and writes transposed into dst, which restores the original orientation. Only unscaled filtering is handled: x_step_q4 and y_step_q4 are asserted to be 16. In each pass a filter whose tap 3 is 0x80 selects the transposed copy, a filter whose first 32-bit word (taps 0 and 1) is zero selects aom_convolve2_dspr2, and anything else selects a size-specific DSPr2 kernel (4, 8, 16/32, 64) or the C fallback convolve_horiz_transposed. */ +void aom_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); + int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; + uint32_t pos = 38; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + /* bit position for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + if (intermediate_height < h) intermediate_height = h; + + /* pass 1: run filter_x over src and write the result transposed into temp */ + if (filter_x[3] == 0x80) { + copy_horiz_transposed(src - src_stride * 3, src_stride, temp, + intermediate_height, w, intermediate_height); + } else if (((const int32_t *)filter_x)[0] == 0) { + aom_convolve2_dspr2(src - src_stride * 3, src_stride, temp, + intermediate_height, filter_x, w, intermediate_height); + } else { + src -= (src_stride * 3 + 3); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + + switch (w) { + case 4: + convolve_horiz_4_transposed_dspr2(src, src_stride, temp, +
intermediate_height, filter_x, + intermediate_height); + break; + case 8: + convolve_horiz_8_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height); + break; + case 16: + case 32: + convolve_horiz_16_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height, (w / 16)); + break; + case 64: + prefetch_load(src + 32); + convolve_horiz_64_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height); + break; + default: + convolve_horiz_transposed(src, src_stride, temp, intermediate_height, + filter_x, w, intermediate_height); + break; + } + } + + /* pass 2: run filter_y over temp and write the result transposed into dst */ + if (filter_y[3] == 0x80) { + copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w); + } else if (((const int32_t *)filter_y)[0] == 0) { + aom_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride, + filter_y, h, w); + } else { + switch (h) { + case 4: + convolve_horiz_4_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w); + break; + case 8: + convolve_horiz_8_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w); + break; + case 16: + case 32: + convolve_horiz_16_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w, (h / 16)); + break; + case 64: + convolve_horiz_64_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w); + break; + default: + convolve_horiz_transposed(temp, intermediate_height, dst, dst_stride, + filter_y, h, w); + break; + } + } +} +/* Copies a w x h block of bytes from src to dst one row at a time. Widths 4, 8, 16, 32 and 64 use inline asm that reads each row with ulw (unaligned word loads) and writes it with sw; every other width uses the byte loop in the default case. filter_x, filter_x_stride, filter_y and filter_y_stride are not referenced in the body. NOTE(review): sw is a plain word store, so dst rows are presumably expected to be 4-byte aligned - confirm against callers. NOTE(review): the asm statements store through dst but declare no memory clobber - confirm this is safe with the compilers in use. */ +void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + int x, y; + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: { + uint32_t tp1; + + /* 1 word storage */ + for (y 
= h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], (%[src]) \n\t" + "sw %[tp1], (%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 8: { + uint32_t tp1, tp2; + + /* 2 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 16: { + uint32_t tp1, tp2, tp3, tp4; + + /* 4 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 32: { + uint32_t tp1, tp2, tp3, tp4; + uint32_t tp5, tp6, tp7, tp8; + + /* 8 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + "ulw %[tp5], 16(%[src]) \n\t" + "ulw %[tp6], 20(%[src]) \n\t" + "ulw 
%[tp7], 24(%[src]) \n\t" + "ulw %[tp8], 28(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), + [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 64: { + uint32_t tp1, tp2, tp3, tp4; + uint32_t tp5, tp6, tp7, tp8; + + prefetch_load(src + 64); + prefetch_store(dst + 32); + + /* 16 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_load(src + src_stride + 64); + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + "ulw %[tp5], 16(%[src]) \n\t" + "ulw %[tp6], 20(%[src]) \n\t" + "ulw %[tp7], 24(%[src]) \n\t" + "ulw %[tp8], 28(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ + + "ulw %[tp1], 32(%[src]) \n\t" + "ulw %[tp2], 36(%[src]) \n\t" + "ulw %[tp3], 40(%[src]) \n\t" + "ulw %[tp4], 44(%[src]) \n\t" + "ulw %[tp5], 48(%[src]) \n\t" + "ulw %[tp6], 52(%[src]) \n\t" + "ulw %[tp7], 56(%[src]) \n\t" + "ulw %[tp8], 60(%[src]) \n\t" + + "sw %[tp1], 32(%[dst]) \n\t" /* store */ + "sw %[tp2], 36(%[dst]) \n\t" /* store */ + "sw 
%[tp3], 40(%[dst]) \n\t" /* store */ + "sw %[tp4], 44(%[dst]) \n\t" /* store */ + "sw %[tp5], 48(%[dst]) \n\t" /* store */ + "sw %[tp6], 52(%[dst]) \n\t" /* store */ + "sw %[tp7], 56(%[dst]) \n\t" /* store */ + "sw %[tp8], 60(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), + [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + default: + for (y = h; y--;) { + for (x = 0; x < w; ++x) { + dst[x] = src[x]; + } + + src += src_stride; + dst += dst_stride; + } + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c new file mode 100644 index 000000000..c60557617 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c @@ -0,0 +1,878 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +/* DSPr2 8-tap horizontal filter for 4-pixel-wide blocks, h rows. The eight 16-bit taps at filter_x0 are read as four 32-bit words (vector1b..vector4b), each holding one tap pair for dpa.w.ph. Per row the asm loads 12 source bytes with ulw (offsets 0, 4, 8), computes the even outputs (0, 2) first and then the odd outputs (1, 3) from the same words realigned with balign, and stores dst[0..3]. Every accumulator is seeded with vector4a = 64 before accumulation (presumably the rounding term for a 7-bit filter shift - confirm against FILTER_BITS); results are extracted with extp and clamped by indexing aom_ff_cropTbl via lbux. NOTE(review): no wrdsp appears in this function, so the extp position is presumably set up by the caller - confirm. NOTE(review): $ac2 and $ac3 are written but the asm has no clobber list - confirm. */ static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t n1, n2, n3, n4; + uint32_t tn1, tn2; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. 
pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "preceu.ph.qbl %[n4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[n1], %[tn1] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[n2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst]) \n\t" + "sb %[tn1], 1(%[dst]) \n\t" + "sb %[tp2], 2(%[dst]) \n\t" + "sb %[n2], 3(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), + [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} +/* DSPr2 8-tap horizontal filter for 8-pixel-wide blocks, h rows. Taps are read from filter_x0 as four 32-bit tap pairs (vector1b..vector4b). Per row the asm loads 16 source bytes with ulw (offsets 0, 4, 8, 12), computes even outputs 0, 2, 4, 6 and then odd outputs 1, 3, 5, 7 from the words realigned with balign, interleaving the sb stores to dst[0..7] with the arithmetic. Accumulators $ac1..$ac3 are each seeded with vector4a = 64 (presumably the rounding term - confirm against FILTER_BITS), read back with extp and clamped through aom_ff_cropTbl via lbux. NOTE(review): the extp position is presumably configured by the caller via wrdsp, and the accumulators are not listed as clobbers - confirm both. */ +static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4, n1; + uint32_t tn1, tn2, tn3; + uint32_t st0, st1; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "preceu.ph.qbl %[n1], %[tn2] \n\t" + "ulw %[tn1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. 
pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tn1] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[st0], 0(%[dst]) \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + + "balign %[tn3], %[tn1], 3 \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[st1], 2(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "sb %[st0], 4(%[dst]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "preceu.ph.qbl %[n1], %[tn1] \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. 
pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[tn3] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[st1], 1(%[dst]) \n\t" + "sb %[st0], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[n1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 3(%[dst]) \n\t" + "sb %[p2], 5(%[dst]) \n\t" + "sb %[n1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0), + [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} +/* DSPr2 8-tap horizontal filter for rows made of count groups of 16 pixels, h rows. Taps are read from filter_x0 as four 32-bit tap pairs (filter12..filter78). For each group the asm computes the eight even outputs from ulw loads at offsets 0..20 and the eight odd outputs from ulw loads at offsets 1..21, so src[0..24] of the group is read and dst[0..15] is written; src and dst then advance by 16. The stages are interleaved: each accumulator ($ac1..$ac3) is re-seeded with vector_64 for a later pixel while earlier results are still being extracted (extp), clamped (lbux on aom_ff_cropTbl) and stored, so the instruction order matters. NOTE(review): vector_64 is presumably the rounding term for FILTER_BITS = 7, the extp position is presumably set by the caller via wrdsp, and the accumulators are not declared as clobbers - confirm. */ +static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + 
+ src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + 
+ src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} +/* Horizontal convolve8 entry point for DSPr2: requires x_step_q4 == 16 (asserted), forwards filter_y/y_step_q4 untouched, and dispatches on w to a width-specific kernel, falling back to aom_convolve8_horiz_c for any other width. */ +void aom_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + if (((const int32_t *)filter_x)[0] == 0) { /* taps 0 and 1 (first 32-bit word) are both zero: use the convolve2 path */ + aom_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + prefetch_load((const uint8_t *)filter_x); + src -= 3; /* the DSPr2 kernels below receive src moved back by 3 pixels */ + + /* bit position for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 8: + convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 16: + convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 1); + break; + case 32: /* same kernel as w == 16 with a final argument of 2 instead of 1 (presumably the count of 16-pixel groups per row; confirm against convolve_horiz_16_dspr2) */ + convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 2); + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + default: /* src + 3 undoes the src -= 3 above before handing off to the C version */ + aom_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c new file mode 100644 index 000000000..d8a90b6ab --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c @@ -0,0 +1,360 @@ +/* + * Copyright
(c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +/* Vertical convolve8 kernel: for each of h rows and each 4-pixel column group (x < w, step 4) it reads 8 source rows starting 3 rows above the output row, accumulates them against the four 32-bit words of filter_y in $ac0..$ac3, and stores 4 bytes looked up through aom_ff_cropTbl. */ static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; /* written to the low half of every accumulator before the multiply-accumulates */ + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; /* start 3 rows above the output row */ + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], 
$ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], 
%[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} +/* Vertical convolve8 kernel for w == 64: the same per-4-pixel instruction sequence as convolve_vert_4_dspr2, with the column loop bound fixed at 64 and a second prefetch_store covering dst + dst_stride + 32. */ +static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; /* written to the low half of every accumulator before the multiply-accumulates */ + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; /* start 3 rows above the output row */ + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] 
\n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 
*/ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} +/* Vertical convolve8 entry point for DSPr2: requires y_step_q4 == 16 (asserted), forwards filter_x/x_step_q4 untouched, and dispatches on w to convolve_vert_4_dspr2 or convolve_vert_64_dspr2, falling back to aom_convolve8_vert_c for any other width. */ +void aom_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + if (((const int32_t *)filter_y)[0] == 0) { /* taps 0 and 1 (first 32-bit word) are both zero: use the convolve2 path */ + aom_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + /* bit position for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: /* convolve_vert_4_dspr2 loops over x in steps of 4 up to w, so one kernel serves all four widths */ + convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h); + break; + case 64: + prefetch_store(dst + 32); + convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); + break; + default: /* src is passed unchanged: this function never offsets it (the kernels above do that themselves) */ + aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} + +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h new file mode 100644 index 000000000..f8fd9e2b6 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_ +#define AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_ + +#include <assert.h> + +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" + +#ifdef __cplusplus +extern "C" { +#endif +/* Prototypes of the DSPr2 convolve2 routines; the convolve8 DSPr2 entry points call the horiz/vert variants when the first 32-bit word of the filter is zero. */ +#if HAVE_DSPR2 +void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h); + +void aom_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h); + +void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h); + +void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter, int w, + int h); + +void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h); + +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c b/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c new file mode 100644 index 000000000..dc9c63226 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c @@ -0,0 +1,948 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/fwd_txfm_msa.h" +/* First column-pass stage: loads 32 rows from input (row pitch src_stride) four at a time, shifts each left by 2, combines each 4-row group with its mirror group through BUTTERFLY_8, and stores the results to temp_buff at a pitch of 8. NOTE(review): LD_SH4/SLLI_4V/BUTTERFLY_8/ST_SH4 are macros defined outside this view; the description assumes their usual load/shift/add-sub/store meaning. */ +static void fdct8x32_1d_column_load_butterfly(const int16_t *input, + int32_t src_stride, + int16_t *temp_buff) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 step0, step1, step2, step3; + v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + v8i16 step0_1, step1_1, step2_1, step3_1; + + /* 1st and 2nd set: rows 0-3 with rows 28-31, and rows 4-7 with rows 24-27 */ + LD_SH4(input, src_stride, in0, in1, in2, in3); + LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7); + LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); + LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); + SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1, + step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); + ST_SH4(step0, step1, step2, step3, temp_buff, 8); + ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8); + ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8); + ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8); + + /* 3rd and 4th set: rows 8-11 with rows 20-23, and rows 12-15 with rows 16-19 */ + LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3); + LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7); + LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); + LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); + SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, 
step2, + step3, in4, in5, in6, in7); + BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1, + step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); + ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8); + ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8); + ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8); + ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8); /* (15 * 8) + 8 == 16 * 8 */ +} +/* Even half of the column pass ("fdct even"): reads 16 vectors from input at offsets 0..120 (pitch 8) and writes 16 result vectors to temp at offsets that are multiples of 64 (0 through 960). NOTE(review): the butterfly/rotation macros and cospi_* constants are defined outside this view. */ +static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 temp0, temp1; + + /* fdct even */ + LD_SH4(input, 8, in0, in1, in2, in3); + LD_SH4(input + 96, 8, in12, in13, in14, in15); + BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2, + vec3, in12, in13, in14, in15); + LD_SH4(input + 32, 8, in4, in5, in6, in7); + LD_SH4(input + 64, 8, in8, in9, in10, in11); + BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7, + in8, in9, in10, in11); + + /* Stage 3 */ + ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); + BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp); + ST_SH(temp1, temp + 512); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 256); + ST_SH(temp1, temp + 768); + + SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 128); + ST_SH(temp1, temp + 896); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + 
DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 640); + ST_SH(temp1, temp + 384); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 64); + ST_SH(temp1, temp + 960); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 576); + ST_SH(temp1, temp + 448); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 320); + ST_SH(temp1, temp + 704); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 192); + ST_SH(temp1, temp + 832); +} +/* Odd half of the column pass: reads the 16 vectors at input offsets 0..120, stores intermediate differences back into input + 16..40 and input + 80..104 (so input is modified and re-read later), and writes 16 result vectors to temp_ptr at offsets that are multiples of 64. */ +static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) { + v8i16 in16, in17, in18, in19, in20, in21, in22, in23; + v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + + in20 = LD_SH(input + 32); + in21 = LD_SH(input + 40); + in26 = LD_SH(input + 80); + in27 = LD_SH(input + 88); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + in18 = LD_SH(input + 16); + in19 = LD_SH(input + 24); + in28 = LD_SH(input + 96); + in29 = LD_SH(input + 
104); + + vec4 = in19 - in20; + ST_SH(vec4, input + 32); + vec4 = in18 - in21; + ST_SH(vec4, input + 40); + vec4 = in29 - in26; + ST_SH(vec4, input + 80); + vec4 = in28 - in27; + ST_SH(vec4, input + 88); + + in21 = in18 + in21; + in20 = in19 + in20; + in27 = in28 + in27; + in26 = in29 + in26; + + LD_SH4(input + 48, 8, in22, in23, in24, in25); + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + in16 = LD_SH(input); + in17 = LD_SH(input + 8); + in30 = LD_SH(input + 112); + in31 = LD_SH(input + 120); + + vec4 = in17 - in22; + ST_SH(vec4, input + 16); + vec4 = in16 - in23; + ST_SH(vec4, input + 24); + vec4 = in31 - in24; + ST_SH(vec4, input + 96); + vec4 = in30 - in25; + ST_SH(vec4, input + 104); + + ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + ADD2(in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr); + ST_SH(vec4, temp_ptr + 960); + + SUB2(in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 448); + ST_SH(vec4, temp_ptr + 512); + + SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); + DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); + SUB2(in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec4, temp_ptr + 704); + ST_SH(vec5, temp_ptr + 256); + + ADD2(in26, in27, in24, in25, in22, in21); + 
DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec4, temp_ptr + 192); + ST_SH(vec5, temp_ptr + 768); + + LD_SH4(input + 16, 8, in22, in23, in20, in21); /* re-reads the scratch values stored into input earlier in this function */ + LD_SH4(input + 80, 8, in26, in27, in24, in25); + in16 = in20; + in17 = in21; + DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); + SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + ADD2(in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 832); + ST_SH(vec4, temp_ptr + 128); + + SUB2(in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 320); + ST_SH(vec4, temp_ptr + 640); + ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); + DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); + SUB2(in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 576); + ST_SH(vec4, temp_ptr + 384); + + ADD2(in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 64); + ST_SH(vec4, temp_ptr + 896); +} +/* Column pass driver: load/butterfly input into tmp_buf, then the even half from tmp_buf and the odd half from tmp_buf + 128; the odd results go to tmp_buf_big + 32. */ +static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride, + int16_t *tmp_buf, int16_t *tmp_buf_big) { + fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf); + fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big); + fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32)); +} +/* First row-pass stage: loads 8-vector groups from temp_buff at a pitch of 32 (element offsets 0 and 24, then 8 and 16), transposes each group, applies BUTTERFLY_16 and stores the results to output at a pitch of 8. NOTE(review): macro semantics are assumed from their names; definitions are outside this view. */ +static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff, + int16_t *output) { + 
v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 step0, step1, step2, step3, step4, step5, step6, step7; + + LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, + step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8); + + /* 2nd set */ + LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, + step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, + (output + 8 * 8), 8); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8); +} + +static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, + int16_t *out) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l; + v4i32 vec0_r, vec1_r, 
vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r; + v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w; + + /* fdct32 even */ + /* stage 2 */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, + vec7, in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8); + + /* Stage 3 */ + UNPCK_SH_SW(vec0, vec0_l, vec0_r); + UNPCK_SH_SW(vec1, vec1_l, vec1_r); + UNPCK_SH_SW(vec2, vec2_l, vec2_r); + UNPCK_SH_SW(vec3, vec3_l, vec3_r); + UNPCK_SH_SW(vec4, vec4_l, vec4_r); + UNPCK_SH_SW(vec5, vec5_l, vec5_r); + UNPCK_SH_SW(vec6, vec6_l, vec6_r); + UNPCK_SH_SW(vec7, vec7_l, vec7_r); + ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w, + tmp1_w, tmp2_w, tmp3_w); + BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r); + ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r, + vec1_r, vec2_r, vec3_r); + + tmp3_w = vec0_r + vec3_r; + vec0_r = vec0_r - vec3_r; + vec3_r = vec1_r + vec2_r; + vec1_r = vec1_r - vec2_r; + + DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64, + vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + ST_SH2(vec5, vec4, out, 8); + + DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, + vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + ST_SH2(vec5, vec4, out + 16, 
8); + + LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); + SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 32); + ST_SH(in5, out + 56); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 40); + ST_SH(in5, out + 48); + + LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 64); + ST_SH(in5, out + 120); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 72); + ST_SH(in5, out + 112); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 80); + ST_SH(in5, out + 104); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 96); + ST_SH(in5, out + 88); +} + +static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) { + v8i16 in0, in1, in2, in3, in4, in5, 
in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, + vec7, in8, in9, in10, in11, in12, in13, in14, in15); + + /* Stage 3 */ + ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); + BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out); + ST_SH(temp1, out + 8); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 16); + ST_SH(temp1, out + 24); + + SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 32); + ST_SH(temp1, out + 56); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 40); + ST_SH(temp1, out + 48); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 64); + 
ST_SH(temp1, out + 120); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 72); + ST_SH(temp1, out + 112); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 80); + ST_SH(temp1, out + 104); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 96); + ST_SH(temp1, out + 88); +} +/* Odd-half row pass: reads the 16 vectors at temp (offsets 0..120, step 8), spills eight intermediate differences to interm_ptr and reloads them later, and stores 16 result vectors to out. */ +static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + v8i16 in16, in17, in18, in19, in20, in21, in22, in23; + v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + + in20 = LD_SH(temp + 32); + in21 = LD_SH(temp + 40); + in26 = LD_SH(temp + 80); + in27 = LD_SH(temp + 88); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + in18 = LD_SH(temp + 16); + in19 = LD_SH(temp + 24); + in28 = LD_SH(temp + 96); + in29 = LD_SH(temp + 104); + + vec4 = in19 - in20; + ST_SH(vec4, interm_ptr + 32); + vec4 = in18 - in21; + ST_SH(vec4, interm_ptr + 88); + vec4 = in28 - in27; + ST_SH(vec4, interm_ptr + 56); + vec4 = in29 - in26; + ST_SH(vec4, interm_ptr + 64); + + ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); + + in22 = LD_SH(temp + 48); + in23 = LD_SH(temp + 56); + in24 = LD_SH(temp + 64); + in25 = LD_SH(temp + 72); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + in16 = LD_SH(temp); + in17 = LD_SH(temp + 8); + in30 = LD_SH(temp + 112); + in31 = 
LD_SH(temp + 120); + + vec4 = in17 - in22; + ST_SH(vec4, interm_ptr + 40); + vec4 = in30 - in25; + ST_SH(vec4, interm_ptr + 48); + vec4 = in31 - in24; + ST_SH(vec4, interm_ptr + 72); + vec4 = in16 - in23; + ST_SH(vec4, interm_ptr + 80); + + ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + + ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + ADD2(in27, in26, in25, in24, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out); + ST_SH(vec4, out + 120); + + SUB2(in27, in26, in25, in24, in22, in21); + + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 112); + ST_SH(vec4, out + 8); + + SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); + DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); + SUB2(in26, in27, in24, in25, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec4, out + 16); + ST_SH(vec5, out + 104); + + ADD2(in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec4, out + 24); + ST_SH(vec5, out + 96); + + in20 = LD_SH(interm_ptr + 32); + in21 = LD_SH(interm_ptr + 88); + in27 = LD_SH(interm_ptr + 56); + in26 = LD_SH(interm_ptr + 64); + + in16 = in20; + in17 = in21; + DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = LD_SH(interm_ptr + 40); + in25 = LD_SH(interm_ptr + 48); + in24 = LD_SH(interm_ptr + 72); + in23 = LD_SH(interm_ptr + 
80); + + SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + ADD2(in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 32); + ST_SH(vec4, out + 88); + + SUB2(in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 40); + ST_SH(vec4, out + 80); + + ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); + DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); + SUB2(in29, in28, in30, in31, in16, in19); + + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 72); + ST_SH(vec4, out + 48); + + ADD2(in29, in28, in30, in31, in17, in18); + + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec4, out + 56); + ST_SH(vec5, out + 64); +} + +static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + + /* 1st set */ + in0 = LD_SH(temp); + in4 = LD_SH(temp + 32); + in2 = LD_SH(temp + 64); + in6 = LD_SH(temp + 96); + in1 = LD_SH(temp + 128); + in7 = LD_SH(temp + 152); + in3 = LD_SH(temp + 192); + in5 = LD_SH(temp + 216); + + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + /* 2nd set */ + in0_1 = LD_SH(temp + 16); + in1_1 = LD_SH(temp + 232); + in2_1 = LD_SH(temp + 80); + in3_1 = LD_SH(temp + 168); + in4_1 = LD_SH(temp + 48); + in5_1 = LD_SH(temp + 176); + in6_1 = LD_SH(temp + 112); + in7_1 = LD_SH(temp + 240); + + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32); + TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, 
in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + + /* 3rd set */ + in0 = LD_SH(temp + 8); + in1 = LD_SH(temp + 136); + in2 = LD_SH(temp + 72); + in3 = LD_SH(temp + 200); + in4 = LD_SH(temp + 40); + in5 = LD_SH(temp + 208); + in6 = LD_SH(temp + 104); + in7 = LD_SH(temp + 144); + + ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8, + 32); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32); + + /* 4th set */ + in0_1 = LD_SH(temp + 24); + in1_1 = LD_SH(temp + 224); + in2_1 = LD_SH(temp + 88); + in3_1 = LD_SH(temp + 160); + in4_1 = LD_SH(temp + 56); + in5_1 = LD_SH(temp + 184); + in6_1 = LD_SH(temp + 120); + in7_1 = LD_SH(temp + 248); + + TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 24, + 32); +} +/* Row pass for one band: butterflies temp into temp_buf, runs the even half in place on temp_buf and the odd half in place on temp_buf plus 128 (temp is reused as the odd half's scratch buffer), then transposes temp_buf into output. */ +static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) { + fdct8x32_1d_row_load_butterfly(temp, temp_buf); + fdct8x32_1d_row_even(temp_buf, temp_buf); + fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128); + fdct8x32_1d_row_transpose_store(temp_buf, output); +} +/* Same call sequence as fdct32x8_1d_row, except the even half goes through fdct8x32_1d_row_even_4x, which also receives tmp_buf_big as its intermediate buffer. */ +static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf); + fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} +/* 32x32 forward DCT entry point: a column pass over four 8-column strips of input fills tmp_buf_big, then a row pass over 256-coefficient bands writes output (the first band uses the _4x row variant). */ +void aom_fdct32x32_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf, + 
tmp_buf_big + (8 * i)); + } + + /* row transform */ + fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output); + + /* row transform */ + for (i = 1; i < 4; ++i) { + fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256)); + } +} + +static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, + vec7, in8, in9, in10, in11, in12, in13, in14, in15); + FDCT_POSTPROC_2V_NEG_H(vec0, vec1); + FDCT_POSTPROC_2V_NEG_H(vec2, vec3); + FDCT_POSTPROC_2V_NEG_H(vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec6, vec7); + FDCT_POSTPROC_2V_NEG_H(in8, in9); + FDCT_POSTPROC_2V_NEG_H(in10, in11); + FDCT_POSTPROC_2V_NEG_H(in12, in13); + FDCT_POSTPROC_2V_NEG_H(in14, in15); + + /* Stage 3 */ + ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); + + temp0 = in0 + in3; + in0 = in0 - in3; + in3 = in1 + in2; + in1 = in1 - in2; + + DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0); + ST_SH(temp0, out); + ST_SH(temp1, out + 8); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + ST_SH(temp0, out + 16); + ST_SH(temp1, out + 24); + + SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + ST_SH(temp0, out + 32); + ST_SH(temp1, out + 56); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + 
ST_SH(temp0, out + 40); + ST_SH(temp1, out + 48); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + ST_SH(temp0, out + 64); + ST_SH(temp1, out + 120); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + ST_SH(temp0, out + 72); + ST_SH(temp1, out + 112); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + ST_SH(temp0, out + 80); + ST_SH(temp1, out + 104); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + ST_SH(temp0, out + 96); + ST_SH(temp1, out + 88); +} + +static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + v8i16 in16, in17, in18, in19, in20, in21, in22, in23; + v8i16 in24, in25, in26, in27, in28, in29, in30, in31; + v8i16 vec4, vec5; + + in20 = LD_SH(temp + 32); + in21 = LD_SH(temp + 40); + in26 = LD_SH(temp + 80); + in27 = LD_SH(temp + 88); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + FDCT_POSTPROC_2V_NEG_H(in20, in21); + FDCT_POSTPROC_2V_NEG_H(in26, in27); + + in18 = LD_SH(temp + 16); + in19 = LD_SH(temp + 24); + in28 = LD_SH(temp + 96); + in29 = LD_SH(temp + 104); + + FDCT_POSTPROC_2V_NEG_H(in18, in19); + FDCT_POSTPROC_2V_NEG_H(in28, in29); + + vec4 = in19 - in20; + ST_SH(vec4, interm_ptr + 32); + vec4 = in18 - in21; + ST_SH(vec4, interm_ptr + 88); + vec4 = in29 
- in26; + ST_SH(vec4, interm_ptr + 64); + vec4 = in28 - in27; + ST_SH(vec4, interm_ptr + 56); + + ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); + + in22 = LD_SH(temp + 48); + in23 = LD_SH(temp + 56); + in24 = LD_SH(temp + 64); + in25 = LD_SH(temp + 72); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + FDCT_POSTPROC_2V_NEG_H(in22, in23); + FDCT_POSTPROC_2V_NEG_H(in24, in25); + + in16 = LD_SH(temp); + in17 = LD_SH(temp + 8); + in30 = LD_SH(temp + 112); + in31 = LD_SH(temp + 120); + + FDCT_POSTPROC_2V_NEG_H(in16, in17); + FDCT_POSTPROC_2V_NEG_H(in30, in31); + + vec4 = in17 - in22; + ST_SH(vec4, interm_ptr + 40); + vec4 = in30 - in25; + ST_SH(vec4, interm_ptr + 48); + vec4 = in31 - in24; + ST_SH(vec4, interm_ptr + 72); + vec4 = in16 - in23; + ST_SH(vec4, interm_ptr + 80); + + ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + ADD2(in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + ST_SH(vec5, out); + ST_SH(vec4, out + 120); + + SUB2(in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + ST_SH(vec5, out + 112); + ST_SH(vec4, out + 8); + + SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); + DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); + SUB2(in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + ST_SH(vec4, out + 16); + ST_SH(vec5, out + 104); + + ADD2(in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, 
cospi_25_64, vec4, vec5); + ST_SH(vec4, out + 24); + ST_SH(vec5, out + 96); + + in20 = LD_SH(interm_ptr + 32); + in21 = LD_SH(interm_ptr + 88); + in27 = LD_SH(interm_ptr + 56); + in26 = LD_SH(interm_ptr + 64); + + in16 = in20; + in17 = in21; + DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = LD_SH(interm_ptr + 40); + in25 = LD_SH(interm_ptr + 48); + in24 = LD_SH(interm_ptr + 72); + in23 = LD_SH(interm_ptr + 80); + + SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + in16 = in28 + in29; + in19 = in31 + in30; + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + ST_SH(vec5, out + 32); + ST_SH(vec4, out + 88); + + SUB2(in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + ST_SH(vec5, out + 40); + ST_SH(vec4, out + 80); + + ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); + DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); + SUB2(in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + ST_SH(vec5, out + 72); + ST_SH(vec4, out + 48); + + ADD2(in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + ST_SH(vec4, out + 56); + ST_SH(vec5, out + 64); +} + +static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf); + fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128)); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void aom_fdct32x32_rd_msa(const int16_t *input, int16_t *out, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + 
+ /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0], + &tmp_buf_big[0] + (8 * i)); + } + + /* row transform */ + for (i = 0; i < 4; ++i) { + fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0], + out + (8 * i * 32)); + } +} +/* DC-only 32x32 forward transform: accumulates sixteen LD_HADD tiles (each covers 8 rows of 8 int16 read at the given stride) and writes (sum >> 3) to out[0]. Tile rows are addressed with the caller's stride so inputs whose row pitch is not 32 are summed correctly; results are unchanged when stride is 32. */ +void aom_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) { + int sum = LD_HADD(input, stride); + sum += LD_HADD(input + 8, stride); + sum += LD_HADD(input + 16, stride); + sum += LD_HADD(input + 24, stride); + sum += LD_HADD(input + 8 * stride, stride); + sum += LD_HADD(input + 8 * stride + 8, stride); + sum += LD_HADD(input + 8 * stride + 16, stride); + sum += LD_HADD(input + 8 * stride + 24, stride); + sum += LD_HADD(input + 16 * stride, stride); + sum += LD_HADD(input + 16 * stride + 8, stride); + sum += LD_HADD(input + 16 * stride + 16, stride); + sum += LD_HADD(input + 16 * stride + 24, stride); + sum += LD_HADD(input + 24 * stride, stride); + sum += LD_HADD(input + 24 * stride + 8, stride); + sum += LD_HADD(input + 24 * stride + 16, stride); + sum += LD_HADD(input + 24 * stride + 24, stride); + out[0] = (int16_t)(sum >> 3); +} diff --git a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c new file mode 100644 index 000000000..f16d290c8 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/mips/fwd_txfm_msa.h" + +void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, + int32_t src_stride) { + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30; + v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5; + v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, + -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; + v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, + cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; + v8i16 coeff2 = { + -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 + }; + + LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, + in10, in11, in12, in13, in14, in15); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + SLLI_4V(in8, in9, in10, in11, 2); + SLLI_4V(in12, in13, in14, in15, 2); + ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3); + ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32); + SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12); + SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8); + + tmp_ptr += 16; + + /* stp 1 */ + ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4); + ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5); + + cnst4 = __msa_splati_h(coeff, 0); + stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4); + + cnst5 = __msa_splati_h(coeff, 1); + cnst5 = __msa_ilvev_h(cnst5, cnst4); + stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5); + stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4); + stp23 = 
DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5); + + /* stp2 */ + BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33); + BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34); + ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4); + ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5); + SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0); + + cnst0 = __msa_splati_h(coeff, 4); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1); + + BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9); + ILVRL_H2_SH(in15, in8, vec1, vec0); + SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr); + + cnst0 = __msa_splati_h(coeff2, 0); + cnst0 = __msa_ilvev_h(cnst1, cnst0); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 224); + + ILVRL_H2_SH(in14, in9, vec1, vec0); + SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); + ST_SH(in8, tmp_ptr + 128); + + cnst1 = __msa_splati_h(coeff2, 2); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 96); + + SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + + stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); + + cnst1 = __msa_splati_h(coeff, 3); + cnst1 = __msa_ilvev_h(cnst0, cnst1); + stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); + + /* stp4 */ + ADD2(stp34, stp25, stp33, stp22, in13, in10); + + ILVRL_H2_SH(in13, in10, vec1, vec0); + SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 64); + + cnst0 = __msa_splati_h(coeff2, 1); + cnst0 = __msa_ilvev_h(cnst1, cnst0); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, 
cnst0); + ST_SH(in8, tmp_ptr + 160); + + SUB2(stp34, stp25, stp33, stp22, in12, in11); + ILVRL_H2_SH(in12, in11, vec1, vec0); + SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); + ST_SH(in8, tmp_ptr + 192); + + cnst1 = __msa_splati_h(coeff2, 3); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 32); +} + +void fdct16x8_1d_row(int16_t *input, int16_t *output) { + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + + LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7); + ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11); + ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15); + SRA_4V(in0, in1, in2, in3, 2); + SRA_4V(in4, in5, in6, in7, 2); + SRA_4V(in8, in9, in10, in11, 2); + SRA_4V(in12, in13, in14, in15, 2); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, + tmp7, in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15); + FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(tmp0, in0, 
tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, + tmp1, in1, tmp2, in2, tmp3, in3); + ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16); + TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, + tmp5, in5, tmp6, in6, tmp7, in7); + ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16); +} +/* 4x4 forward DCT: loads four rows at src_stride, pre-processes them, runs AOM_FDCT4 followed by a 4x4 transpose twice, adds 1 and applies SRA_4V(..., 2), then packs and stores two vectors (16 coefficients) to output. */ +void aom_fdct4x4_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + v8i16 in0, in1, in2, in3; + + LD_SH4(input, src_stride, in0, in1, in2, in3); + + /* fdct4 pre-process */ + { + v8i16 vec, mask; + v16i8 zero = { 0 }; + v16i8 one = __msa_ldi_b(1); + + mask = (v8i16)__msa_sldi_b(zero, one, 15); + SLLI_4V(in0, in1, in2, in3, 4); + vec = __msa_ceqi_h(in0, 0); /* NOTE(review): the next three statements appear to add 1 to a single lane of in0 when that lane is nonzero; confirm against the MSA sldi.b and ceqi.h semantics */ + vec = vec ^ 255; + vec = mask & vec; + in0 += vec; + } + + AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + SRA_4V(in0, in1, in2, in3, 2); + PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); + ST_SH2(in0, in2, output, 8); +} +/* 8x8 forward DCT: loads eight rows at src_stride, applies SLLI_4V(..., 2) to all of them, runs AOM_FDCT8 followed by an 8x8 transpose twice, post-processes with SRLI_AVE_S_4V_H and stores eight vectors (64 coefficients) to output. */ +void aom_fdct8x8_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); +} + +void aom_fdct8x8_1_msa(const 
int16_t *input, int16_t *out, int32_t stride) { + out[0] = LD_HADD(input, stride); + out[1] = 0; +} +/* 16x16 forward DCT: two 8-column passes of fdct8x16_1d_column fill tmp_buf, then two passes of fdct16x8_1d_row (128 coefficients apart) write output. */ +void aom_fdct16x16_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]); + + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i))); + } +} +/* DC-only 16x16 forward transform: adds the LD_HADD sums of the four 8x8 quadrants and stores (sum >> 1) in out[0]. Input rows are `stride` elements apart, so the lower quadrants start 8 * stride elements in (not 16 * 8, which is only right when stride == 16). */ +void aom_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) { + int sum = LD_HADD(input, stride); + sum += LD_HADD(input + 8, stride); + sum += LD_HADD(input + 8 * stride, stride); + sum += LD_HADD(input + 8 * stride + 8, stride); + out[0] = (int16_t)(sum >> 1); +} diff --git a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h new file mode 100644 index 000000000..ada25dffd --- /dev/null +++ b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_MIPS_FWD_TXFM_MSA_H_ +#define AOM_DSP_MIPS_FWD_TXFM_MSA_H_ + +#include "aom_dsp/mips/txfm_macros_msa.h" +#include "aom_dsp/txfm_common.h" +/* LD_HADD: loads 8 rows of 8 int16 samples starting at psrc (rows `stride` elements apart), adds them lane-wise and evaluates to HADD_SW_S32 of the result (presumably the scalar sum of the block). `stride` is parenthesised so expression arguments expand correctly. */ +#define LD_HADD(psrc, stride) \ + ({ \ + v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \ + v4i32 vec_w_m; \ + \ + LD_SH4((psrc), (stride), in0_m, in1_m, in2_m, in3_m); \ + ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \ + LD_SH4(((psrc) + 4 * (stride)), (stride), in4_m, in5_m, in6_m, in7_m); \ + ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \ + in0_m, in4_m); \ + in0_m += in4_m; \ + \ + vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \ + HADD_SW_S32(vec_w_m); \ + }) + +#define AOM_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \ + v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 coeff_m = { \ + cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, 0, 0, 0 \ + }; \ + \ + BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \ + ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m); \ + \ + SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m); \ + cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m); \ + vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ + \ + vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m); \ + cnst2_m = __msa_splati_h(coeff_m, 2); \ + cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m); \ + vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ + \ + SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, vec7_m, \ + vec7_m, out0, out2, out1, out3); \ + } + +#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \ + { \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + \ + SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15); \ 
+ SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15); \ + AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, in0, in1, \ + in2, in3); \ + AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, in4, in5, \ + in6, in7); \ + } + +#define AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ + v8i16 s7_m, x0_m, x1_m, x2_m, x3_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ + cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ + \ + /* FDCT stage1 */ \ + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \ + s3_m, s4_m, s5_m, s6_m, s7_m); \ + BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x1_m, x0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ + x2_m = -x2_m; \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + x2_m = __msa_splati_h(coeff_m, 2); \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + /* stage2 */ \ + ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ + \ + s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + /* stage3 */ \ + BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x0_m, x1_m); \ + out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out5 
= DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + \ + x1_m = __msa_splati_h(coeff_m, 5); \ + x0_m = -x0_m; \ + x0_m = __msa_ilvev_h(x1_m, x0_m); \ + out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ + \ + x2_m = __msa_splati_h(coeff_m, 6); \ + x3_m = -x3_m; \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + } + +#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + v8i16 x0_m, x1_m, x2_m, x3_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ + cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ + \ + /* FDCT stage1 */ \ + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \ + s3_m, s4_m, s5_m, s6_m, s7_m); \ + BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x1_m, x0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ + x2_m = -x2_m; \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + x2_m = __msa_splati_h(coeff_m, 2); \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + /* stage2 */ \ + ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ + \ + s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + /* stage3 */ \ + BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x0_m, x1_m); \ + out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ 
+ \ + SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + \ + x1_m = __msa_splati_h(coeff_m, 5); \ + x0_m = -x0_m; \ + x0_m = __msa_ilvev_h(x1_m, x0_m); \ + out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ + \ + x2_m = __msa_splati_h(coeff_m, 6); \ + x3_m = -x3_m; \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + } + +#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \ + input7, out1, out3, out5, out7, out9, out11, out13, \ + out15) \ + { \ + v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ + v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ + v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \ + v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \ + v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ + -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; \ + v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, \ + cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; \ + v8i16 coeff2_m = { \ + -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 \ + }; \ + \ + /* stp 1 */ \ + ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \ + ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m); \ + \ + cnst4_m = __msa_splati_h(coeff_m, 0); \ + stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m); \ + \ + cnst5_m = __msa_splati_h(coeff_m, 1); \ + cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m); \ + stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m); \ + stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m); \ + stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \ + \ + /* stp2 */ \ + BUTTERFLY_4(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, stp32_m, \ + stp33_m); \ + BUTTERFLY_4(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, stp35_m, \ + stp34_m); \ + \ + ILVL_H2_SH(stp36_m, 
stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \ + ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \ + \ + SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff_m, 4); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ + \ + cnst0_m = __msa_splati_h(coeff_m, 3); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ + \ + /* stp4 */ \ + BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, vec4_m, \ + vec5_m); \ + BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, stp24_m, \ + stp31_m); \ + \ + ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + \ + out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff2_m, 0); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + \ + cnst1_m = __msa_splati_h(coeff2_m, 2); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff2_m, 1); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + 
out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + \ + cnst1_m = __msa_splati_h(coeff2_m, 3); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + } + +#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \ + { \ + v8i16 tp0_m, tp1_m; \ + v8i16 one_m = __msa_ldi_h(1); \ + \ + tp0_m = __msa_clti_s_h(vec0, 0); \ + tp1_m = __msa_clti_s_h(vec1, 0); \ + vec0 += 1; \ + vec1 += 1; \ + tp0_m = one_m & tp0_m; \ + tp1_m = one_m & tp1_m; \ + vec0 += tp0_m; \ + vec1 += tp1_m; \ + vec0 >>= 2; \ + vec1 >>= 2; \ + } + +#define FDCT32_POSTPROC_NEG_W(vec) \ + { \ + v4i32 temp_m; \ + v4i32 one_m = __msa_ldi_w(1); \ + \ + temp_m = __msa_clti_s_w(vec, 0); \ + vec += 1; \ + temp_m = one_m & temp_m; \ + vec += temp_m; \ + vec >>= 2; \ + } + +#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ + { \ + v8i16 tp0_m, tp1_m; \ + v8i16 one = __msa_ldi_h(1); \ + \ + tp0_m = __msa_clei_s_h(vec0, 0); \ + tp1_m = __msa_clei_s_h(vec1, 0); \ + tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \ + tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \ + vec0 += 1; \ + vec1 += 1; \ + tp0_m = one & tp0_m; \ + tp1_m = one & tp1_m; \ + vec0 += tp0_m; \ + vec1 += tp1_m; \ + vec0 >>= 2; \ + vec1 >>= 2; \ + } + +#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \ + const0, const1, out0, out1, out2, out3) \ + { \ + v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \ + v4i32 k0_m = __msa_fill_w((int32_t)const0); \ + \ + s0_m = __msa_fill_w((int32_t)const1); \ + k0_m = __msa_ilvev_w(s0_m, k0_m); \ + \ + ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \ + ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \ + ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \ + ILVRL_W2_SW(reg0_right, 
reg1_right, s7_m, s6_m); \ + \ + DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \ + DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \ + tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ + tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ + tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ + tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ + out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ + out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ + \ + DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \ + DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \ + tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ + tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ + tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ + tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ + out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ + out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ + } + +void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, + int32_t src_stride); +void fdct16x8_1d_row(int16_t *input, int16_t *output); +#endif // AOM_DSP_MIPS_FWD_TXFM_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/idct16x16_msa.c b/third_party/aom/aom_dsp/mips/idct16x16_msa.c new file mode 100644 index 000000000..0ea127f52 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/idct16x16_msa.c @@ -0,0 +1,486 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/mips/inv_txfm_msa.h" + +void aom_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { + v8i16 loc0, loc1, loc2, loc3; + v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; + v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; + v8i16 tmp5, tmp6, tmp7; + + LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + input += 8; + LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); + + TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg0, reg1, + reg2, reg3, reg4, reg5, reg6, reg7); + TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, reg8, + reg9, reg10, reg11, reg12, reg13, reg14, reg15); + DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); + DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); + BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); + DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); + DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); + DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); + BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); + SUB4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg0, reg12, reg4, + reg8); + ADD4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg2, reg14, reg6, + reg10); + + /* stage 2 */ + DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); + DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); + + reg9 = reg1 - loc2; + reg1 = reg1 + loc2; + reg7 = reg15 - loc3; + reg15 = reg15 + loc3; + + DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); + DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); + BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5); + + loc1 = reg15 + reg3; + reg3 = reg15 - reg3; + loc2 = reg2 + loc1; + reg15 = reg2 - loc1; + + loc1 = reg1 + reg13; + reg13 = reg1 - reg13; + loc0 = reg0 + loc1; + loc1 = reg0 - 
loc1; + tmp6 = loc0; + tmp7 = loc1; + reg0 = loc2; + + DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); + DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11); + + loc0 = reg9 + reg5; + reg5 = reg9 - reg5; + reg2 = reg6 + loc0; + reg1 = reg6 - loc0; + + loc0 = reg7 + reg11; + reg11 = reg7 - reg11; + loc1 = reg4 + loc0; + loc2 = reg4 - loc0; + tmp5 = loc1; + + DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); + BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); + + reg10 = loc0; + reg11 = loc1; + + DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); + BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); + + reg13 = loc2; + + /* Transpose and store the output */ + reg12 = tmp5; + reg14 = tmp6; + reg3 = tmp7; + + /* transpose block */ + TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, reg0, + reg2, reg4, reg6, reg8, reg10, reg12, reg14); + ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16); + + /* transpose block */ + TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, reg3, + reg13, reg11, reg5, reg7, reg9, reg1, reg15); + ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16); +} + +void aom_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 loc0, loc1, loc2, loc3; + v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; + v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; + v8i16 tmp5, tmp6, tmp7; + + /* load up 8x8 */ + LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + input += 8 * 16; + /* load bottom 8x8 */ + LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); + + DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); + DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); + BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); + DOTP_CONST_PAIR(reg14, reg2, 
cospi_16_64, cospi_16_64, loc2, loc3); + DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); + DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); + BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); + + reg0 = reg2 - loc1; + reg2 = reg2 + loc1; + reg12 = reg14 - loc0; + reg14 = reg14 + loc0; + reg4 = reg6 - loc3; + reg6 = reg6 + loc3; + reg8 = reg10 - loc2; + reg10 = reg10 + loc2; + + /* stage 2 */ + DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); + DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); + + reg9 = reg1 - loc2; + reg1 = reg1 + loc2; + reg7 = reg15 - loc3; + reg15 = reg15 + loc3; + + DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); + DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); + BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5); + + loc1 = reg15 + reg3; + reg3 = reg15 - reg3; + loc2 = reg2 + loc1; + reg15 = reg2 - loc1; + + loc1 = reg1 + reg13; + reg13 = reg1 - reg13; + loc0 = reg0 + loc1; + loc1 = reg0 - loc1; + tmp6 = loc0; + tmp7 = loc1; + reg0 = loc2; + + DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); + DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11); + + loc0 = reg9 + reg5; + reg5 = reg9 - reg5; + reg2 = reg6 + loc0; + reg1 = reg6 - loc0; + + loc0 = reg7 + reg11; + reg11 = reg7 - reg11; + loc1 = reg4 + loc0; + loc2 = reg4 - loc0; + tmp5 = loc1; + + DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); + BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); + + reg10 = loc0; + reg11 = loc1; + + DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); + BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); + reg13 = loc2; + + /* Transpose and store the output */ + reg12 = tmp5; + reg14 = tmp6; + reg3 = tmp7; + + SRARI_H4_SH(reg0, reg2, reg4, reg6, 6); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6); + dst += (4 * 
dst_stride); + SRARI_H4_SH(reg8, reg10, reg12, reg14, 6); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14); + dst += (4 * dst_stride); + SRARI_H4_SH(reg3, reg13, reg11, reg5, 6); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5); + dst += (4 * dst_stride); + SRARI_H4_SH(reg7, reg9, reg1, reg15, 6); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15); +} +/* Full 16x16 inverse DCT: two 16x8 row passes from input into out_arr, then two 8x16 column passes (aom_idct16_1d_columns_addblk_msa) from out_arr onto dst. */ +void aom_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); + int16_t *out = out_arr; + + /* transform rows */ + for (i = 0; i < 2; ++i) { + /* process 16 * 8 block */ + aom_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7))); + } + + /* transform columns */ + for (i = 0; i < 2; ++i) { + /* process 8 * 16 block */ + aom_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} +/* Sparse-input 16x16 inverse DCT: a single 16x8 row pass, then rows 4..15 of out_arr are zeroed (12 rows of 16 int16 = 8 word stores each) before the two column passes onto dst. */ +void aom_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + uint8_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); + int16_t *out = out_arr; + + /* process 16 * 8 block */ + aom_idct16_1d_rows_msa(input, out); + + /* short case just considers top 4 rows as valid output */ + out += 4 * 16; + for (i = 12; i--;) { + __asm__ __volatile__( + "sw $zero, 0(%[out]) \n\t" + "sw $zero, 4(%[out]) \n\t" + "sw $zero, 8(%[out]) \n\t" + "sw $zero, 12(%[out]) \n\t" + "sw $zero, 16(%[out]) \n\t" + "sw $zero, 20(%[out]) \n\t" + "sw $zero, 24(%[out]) \n\t" + "sw $zero, 28(%[out]) \n\t" + /* the stores above write out_arr through a plain register input, so a "memory" clobber is needed to keep the compiler from reordering or dropping accesses to out_arr around this asm */ + : + : [out] "r"(out) : "memory"); + + out += 16; + } + + out = out_arr; + + /* transform columns */ + for (i = 0; i < 2; ++i) { + /* process 8 * 16 block */ + aom_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void aom_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + uint8_t i; + int16_t out; + v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7; + v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, 
tmp3; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 6); + + vec = __msa_fill_h(out); + + for (i = 4; i--;) { + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + UNPCK_UB_SH(dst0, res0, res4); + UNPCK_UB_SH(dst1, res1, res5); + UNPCK_UB_SH(dst2, res2, res6); + UNPCK_UB_SH(dst3, res3, res7); + ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); + ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); + CLIP_SH4_0_255(res0, res1, res2, res3); + CLIP_SH4_0_255(res4, res5, res6, res7); + PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1, + tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +void aom_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; + + /* load input data */ + LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, + l7, l15); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, l0, l1, l2, l3, l4, l5, l6, + l7); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, l8, l9, l10, l11, + l12, l13, l14, l15); + + /* ADST in horizontal */ + AOM_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, + l14, l15, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, + r12, r13, r14, r15); + + l1 = -r8; + l3 = -r4; + l13 = -r13; + l15 = -r1; + + TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, l0, l1, l2, l3, l4, l5, + l6, l7); + ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16); + TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, l8, l9, l10, l11, l12, + l13, l14, l15); + ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16); +} + +void aom_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + 
int32_t dst_stride) { + v8i16 v0, v2, v4, v6, k0, k1, k2, k3; + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 out8, out9, out10, out11, out12, out13, out14, out15; + v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15; + v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + v8i16 res8, res9, res10, res11, res12, res13, res14, res15; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + v16i8 zero = { 0 }; + + r0 = LD_SH(input + 0 * 16); + r3 = LD_SH(input + 3 * 16); + r4 = LD_SH(input + 4 * 16); + r7 = LD_SH(input + 7 * 16); + r8 = LD_SH(input + 8 * 16); + r11 = LD_SH(input + 11 * 16); + r12 = LD_SH(input + 12 * 16); + r15 = LD_SH(input + 15 * 16); + + /* stage 1 */ + k0 = AOM_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); + k1 = AOM_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); + k2 = AOM_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); + k3 = AOM_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); + MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); + k0 = AOM_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); + k1 = AOM_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); + k2 = AOM_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); + k3 = AOM_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); + MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); + BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0); + k0 = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); + k1 = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); + k2 = AOM_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); + MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); + + r1 = LD_SH(input + 1 * 16); + r2 = LD_SH(input + 2 * 16); + r5 = LD_SH(input + 5 * 16); + r6 = LD_SH(input + 6 * 16); + r9 = LD_SH(input + 9 * 16); + r10 = LD_SH(input + 10 * 16); + r13 = LD_SH(input + 13 * 16); + r14 = LD_SH(input + 14 * 16); + + k0 = 
AOM_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); + k1 = AOM_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); + k2 = AOM_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); + k3 = AOM_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); + MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7); + k0 = AOM_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); + k1 = AOM_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); + k2 = AOM_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); + k3 = AOM_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); + MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15); + BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4); + BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10); + out1 = -out1; + SRARI_H2_SH(out0, out1, 6); + dst0 = LD_UB(dst + 0 * dst_stride); + dst1 = LD_UB(dst + 15 * dst_stride); + ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1); + ADD2(res0, out0, res1, out1, res0, res1); + CLIP_SH2_0_255(res0, res1); + PCKEV_B2_SH(res0, res0, res1, res1, res0, res1); + ST8x1_UB(res0, dst); + ST8x1_UB(res1, dst + 15 * dst_stride); + + k0 = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); + k1 = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); + k2 = AOM_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); + MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); + BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); + out8 = -out8; + + SRARI_H2_SH(out8, out9, 6); + dst8 = LD_UB(dst + 1 * dst_stride); + dst9 = LD_UB(dst + 14 * dst_stride); + ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9); + ADD2(res8, out8, res9, out9, res8, res9); + CLIP_SH2_0_255(res8, res9); + PCKEV_B2_SH(res8, res8, res9, res9, res8, res9); + ST8x1_UB(res8, dst + dst_stride); + ST8x1_UB(res9, dst + 14 * dst_stride); + + k0 = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); + k1 = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); + k2 = AOM_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); + MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7); + out4 = -out4; + SRARI_H2_SH(out4, out5, 6); + dst4 = LD_UB(dst + 3 * dst_stride); + dst5 = LD_UB(dst + 
12 * dst_stride); + ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5); + ADD2(res4, out4, res5, out5, res4, res5); + CLIP_SH2_0_255(res4, res5); + PCKEV_B2_SH(res4, res4, res5, res5, res4, res5); + ST8x1_UB(res4, dst + 3 * dst_stride); + ST8x1_UB(res5, dst + 12 * dst_stride); + + MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); + out13 = -out13; + SRARI_H2_SH(out12, out13, 6); + dst12 = LD_UB(dst + 2 * dst_stride); + dst13 = LD_UB(dst + 13 * dst_stride); + ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13); + ADD2(res12, out12, res13, out13, res12, res13); + CLIP_SH2_0_255(res12, res13); + PCKEV_B2_SH(res12, res12, res13, res13, res12, res13); + ST8x1_UB(res12, dst + 2 * dst_stride); + ST8x1_UB(res13, dst + 13 * dst_stride); + + k0 = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); + k3 = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); + MADD_SHORT(out6, out7, k0, k3, out6, out7); + SRARI_H2_SH(out6, out7, 6); + dst6 = LD_UB(dst + 4 * dst_stride); + dst7 = LD_UB(dst + 11 * dst_stride); + ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7); + ADD2(res6, out6, res7, out7, res6, res7); + CLIP_SH2_0_255(res6, res7); + PCKEV_B2_SH(res6, res6, res7, res7, res6, res7); + ST8x1_UB(res6, dst + 4 * dst_stride); + ST8x1_UB(res7, dst + 11 * dst_stride); + + MADD_SHORT(out10, out11, k0, k3, out10, out11); + SRARI_H2_SH(out10, out11, 6); + dst10 = LD_UB(dst + 6 * dst_stride); + dst11 = LD_UB(dst + 9 * dst_stride); + ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11); + ADD2(res10, out10, res11, out11, res10, res11); + CLIP_SH2_0_255(res10, res11); + PCKEV_B2_SH(res10, res10, res11, res11, res10, res11); + ST8x1_UB(res10, dst + 6 * dst_stride); + ST8x1_UB(res11, dst + 9 * dst_stride); + + k1 = AOM_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); + k2 = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); + MADD_SHORT(h10, h11, k1, k2, out2, out3); + SRARI_H2_SH(out2, out3, 6); + dst2 = LD_UB(dst + 7 * dst_stride); + dst3 = LD_UB(dst + 8 * dst_stride); + ILVR_B2_SH(zero, dst2, zero, 
dst3, res2, res3); + ADD2(res2, out2, res3, out3, res2, res3); + CLIP_SH2_0_255(res2, res3); + PCKEV_B2_SH(res2, res2, res3, res3, res2, res3); + ST8x1_UB(res2, dst + 7 * dst_stride); + ST8x1_UB(res3, dst + 8 * dst_stride); + + MADD_SHORT(out14, out15, k1, k2, out14, out15); + SRARI_H2_SH(out14, out15, 6); + dst14 = LD_UB(dst + 5 * dst_stride); + dst15 = LD_UB(dst + 10 * dst_stride); + ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15); + ADD2(res14, out14, res15, out15, res14, res15); + CLIP_SH2_0_255(res14, res15); + PCKEV_B2_SH(res14, res14, res15, res15, res14, res15); + ST8x1_UB(res14, dst + 5 * dst_stride); + ST8x1_UB(res15, dst + 10 * dst_stride); +} diff --git a/third_party/aom/aom_dsp/mips/idct32x32_msa.c b/third_party/aom/aom_dsp/mips/idct32x32_msa.c new file mode 100644 index 000000000..f1ca757a0 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/idct32x32_msa.c @@ -0,0 +1,730 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/mips/inv_txfm_msa.h" + +static void idct32x8_row_transpose_store(const int16_t *input, + int16_t *tmp_buf) { + v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; + + /* 1st & 2nd 8x8 */ + LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3); + LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8); + ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8); + ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8); + + /* 3rd & 4th 8x8 */ + LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3); + LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8); + ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8); + ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8); + ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8); +} + +static void idct32x8_row_even_process_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; + + /* Even stage 1 */ + LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + + loc1 = vec3; + loc0 = vec1; + + DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + 
BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); + + /* Even stage 2 */ + LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + + vec0 = reg0 + reg4; + reg0 = reg0 - reg4; + reg4 = reg6 + reg2; + reg6 = reg6 - reg2; + reg2 = reg1 + reg5; + reg1 = reg1 - reg5; + reg5 = reg7 + reg3; + reg7 = reg7 - reg3; + reg3 = vec0; + + vec1 = reg2; + reg2 = reg3 + reg4; + reg3 = reg3 - reg4; + reg4 = reg5 - vec1; + reg5 = reg5 + vec1; + + DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); + + vec0 = reg0 - reg6; + reg0 = reg0 + reg6; + vec1 = reg7 - reg1; + reg7 = reg7 + reg1; + + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + + /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ + BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + ST_SH(loc0, (tmp_eve_buf + 15 * 8)); + ST_SH(loc1, (tmp_eve_buf)); + ST_SH(loc2, (tmp_eve_buf + 14 * 8)); + ST_SH(loc3, (tmp_eve_buf + 8)); + + BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); + ST_SH(loc0, (tmp_eve_buf + 13 * 8)); + ST_SH(loc1, (tmp_eve_buf + 2 * 8)); + ST_SH(loc2, (tmp_eve_buf + 12 * 8)); + ST_SH(loc3, (tmp_eve_buf + 3 * 8)); + + /* Store 8 */ + BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + ST_SH(loc0, (tmp_eve_buf + 11 * 8)); + ST_SH(loc1, (tmp_eve_buf + 4 * 8)); + ST_SH(loc2, (tmp_eve_buf + 10 * 8)); + ST_SH(loc3, (tmp_eve_buf + 5 * 8)); + + BUTTERFLY_4(stp6, stp7, reg2, reg0, 
loc1, loc3, loc2, loc0); + ST_SH(loc0, (tmp_eve_buf + 9 * 8)); + ST_SH(loc1, (tmp_eve_buf + 6 * 8)); + ST_SH(loc2, (tmp_eve_buf + 8 * 8)); + ST_SH(loc3, (tmp_eve_buf + 7 * 8)); +} + +static void idct32x8_row_odd_process_store(int16_t *tmp_buf, + int16_t *tmp_odd_buf) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + + /* Odd stage 1 */ + reg0 = LD_SH(tmp_buf + 8); + reg1 = LD_SH(tmp_buf + 7 * 8); + reg2 = LD_SH(tmp_buf + 9 * 8); + reg3 = LD_SH(tmp_buf + 15 * 8); + reg4 = LD_SH(tmp_buf + 17 * 8); + reg5 = LD_SH(tmp_buf + 23 * 8); + reg6 = LD_SH(tmp_buf + 25 * 8); + reg7 = LD_SH(tmp_buf + 31 * 8); + + DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + + vec0 = reg0 + reg3; + reg0 = reg0 - reg3; + reg3 = reg7 + reg4; + reg7 = reg7 - reg4; + reg4 = reg1 + reg2; + reg1 = reg1 - reg2; + reg2 = reg6 + reg5; + reg6 = reg6 - reg5; + reg5 = vec0; + + /* 4 Stores */ + ADD2(reg5, reg4, reg3, reg2, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); + + SUB2(reg5, reg4, reg3, reg2, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf), 8); + + /* 4 Stores */ + DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); + + DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); + + /* Odd stage 2 */ + /* 8 loads */ + reg0 = LD_SH(tmp_buf + 3 * 8); + reg1 = LD_SH(tmp_buf + 5 * 8); + reg2 = LD_SH(tmp_buf + 11 * 8); + reg3 = LD_SH(tmp_buf + 13 * 8); + reg4 = LD_SH(tmp_buf + 19 * 8); + reg5 
= LD_SH(tmp_buf + 21 * 8); + reg6 = LD_SH(tmp_buf + 27 * 8); + reg7 = LD_SH(tmp_buf + 29 * 8); + + DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + + /* 4 Stores */ + SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3); + DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + + BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3); + ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); + + DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); + + /* 4 Stores */ + ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, vec2, vec0, vec3); + BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + ST_SH(reg0, (tmp_odd_buf + 13 * 8)); + ST_SH(reg1, (tmp_odd_buf + 14 * 8)); + + DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); + + /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ + + /* Load 8 & Store 8 */ + LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); + LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); + + SUB2(reg0, reg4, reg1, reg5, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg2, reg6, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); + + /* Load 8 & Store 8 */ + LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); + LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, 
reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); + + SUB2(reg0, reg4, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg1, reg5, reg2, reg6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); +} + +static void idct_butterfly_transpose_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, int16_t *dst) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + vec0 = LD_SH(tmp_odd_buf); + vec1 = LD_SH(tmp_odd_buf + 9 * 8); + vec2 = LD_SH(tmp_odd_buf + 14 * 8); + vec3 = LD_SH(tmp_odd_buf + 6 * 8); + loc0 = LD_SH(tmp_eve_buf); + loc1 = LD_SH(tmp_eve_buf + 8 * 8); + loc2 = LD_SH(tmp_eve_buf + 4 * 8); + loc3 = LD_SH(tmp_eve_buf + 12 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); + + ST_SH((loc0 - vec3), (tmp_buf + 31 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 23 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 27 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 19 * 8)); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 4 * 8); + vec1 = LD_SH(tmp_odd_buf + 13 * 8); + vec2 = LD_SH(tmp_odd_buf + 10 * 8); + vec3 = LD_SH(tmp_odd_buf + 3 * 8); + loc0 = LD_SH(tmp_eve_buf + 2 * 8); + loc1 = LD_SH(tmp_eve_buf + 10 * 8); + loc2 = LD_SH(tmp_eve_buf + 6 * 8); + loc3 = LD_SH(tmp_eve_buf + 14 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); + + ST_SH((loc0 - vec3), (tmp_buf + 29 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 21 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 25 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 17 * 8)); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 2 * 8); + vec1 = LD_SH(tmp_odd_buf + 11 * 8); + vec2 = LD_SH(tmp_odd_buf + 12 * 8); + vec3 = 
LD_SH(tmp_odd_buf + 7 * 8); + loc0 = LD_SH(tmp_eve_buf + 1 * 8); + loc1 = LD_SH(tmp_eve_buf + 9 * 8); + loc2 = LD_SH(tmp_eve_buf + 5 * 8); + loc3 = LD_SH(tmp_eve_buf + 13 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); + + ST_SH((loc0 - vec3), (tmp_buf + 30 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 22 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 26 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 18 * 8)); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 5 * 8); + vec1 = LD_SH(tmp_odd_buf + 15 * 8); + vec2 = LD_SH(tmp_odd_buf + 8 * 8); + vec3 = LD_SH(tmp_odd_buf + 1 * 8); + loc0 = LD_SH(tmp_eve_buf + 3 * 8); + loc1 = LD_SH(tmp_eve_buf + 11 * 8); + loc2 = LD_SH(tmp_eve_buf + 7 * 8); + loc3 = LD_SH(tmp_eve_buf + 15 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); + + ST_SH((loc0 - vec3), (tmp_buf + 28 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 20 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 24 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 16 * 8)); + + /* Transpose : 16 vectors */ + /* 1st & 2nd 8x8 */ + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + ST_SH4(m0, n0, m1, n1, (dst + 0), 32); + ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32); + + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + ST_SH4(m4, n4, m5, n5, (dst + 8), 32); + ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32); + + /* 3rd & 4th 8x8 */ + LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3); + LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + ST_SH4(m0, n0, m1, n1, (dst + 16), 32); + ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32); + + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + ST_SH4(m4, n4, m5, n5, (dst + 24), 32); + ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32); +} + +static void idct32x8_1d_rows_msa(const int16_t *input, 
int16_t *output) { + DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]); + DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); + DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); + + idct32x8_row_transpose_store(input, &tmp_buf[0]); + idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]); + idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]); + idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0], + output); +} + +static void idct8x32_column_even_process_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; + + /* Even stage 1 */ + LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + tmp_buf += (2 * 32); + + DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + + loc1 = vec3; + loc0 = vec1; + + DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); + + /* Even stage 2 */ + /* Load 8 */ + LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + + vec0 = reg0 + reg4; + reg0 = reg0 - reg4; + reg4 = reg6 + reg2; + reg6 = reg6 - reg2; + reg2 = reg1 + reg5; + reg1 = reg1 - reg5; + reg5 
= reg7 + reg3; + reg7 = reg7 - reg3; + reg3 = vec0; + + vec1 = reg2; + reg2 = reg3 + reg4; + reg3 = reg3 - reg4; + reg4 = reg5 - vec1; + reg5 = reg5 + vec1; + + DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); + + vec0 = reg0 - reg6; + reg0 = reg0 + reg6; + vec1 = reg7 - reg1; + reg7 = reg7 + reg1; + + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + + /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ + /* Store 8 */ + BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, tmp_eve_buf, 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8); + + BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8); + + /* Store 8 */ + BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8); + + BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8); +} + +static void idct8x32_column_odd_process_store(int16_t *tmp_buf, + int16_t *tmp_odd_buf) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + + /* Odd stage 1 */ + reg0 = LD_SH(tmp_buf + 32); + reg1 = LD_SH(tmp_buf + 7 * 32); + reg2 = LD_SH(tmp_buf + 9 * 32); + reg3 = LD_SH(tmp_buf + 15 * 32); + reg4 = LD_SH(tmp_buf + 17 * 32); + reg5 = LD_SH(tmp_buf + 23 * 32); + reg6 = LD_SH(tmp_buf + 25 * 32); + reg7 = LD_SH(tmp_buf + 31 * 32); + + DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, 
reg1, cospi_7_64, cospi_25_64, reg1, reg6); + + vec0 = reg0 + reg3; + reg0 = reg0 - reg3; + reg3 = reg7 + reg4; + reg7 = reg7 - reg4; + reg4 = reg1 + reg2; + reg1 = reg1 - reg2; + reg2 = reg6 + reg5; + reg6 = reg6 - reg5; + reg5 = vec0; + + /* 4 Stores */ + ADD2(reg5, reg4, reg3, reg2, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); + SUB2(reg5, reg4, reg3, reg2, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + ST_SH2(vec0, vec1, tmp_odd_buf, 8); + + /* 4 Stores */ + DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); + DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); + + /* Odd stage 2 */ + /* 8 loads */ + reg0 = LD_SH(tmp_buf + 3 * 32); + reg1 = LD_SH(tmp_buf + 5 * 32); + reg2 = LD_SH(tmp_buf + 11 * 32); + reg3 = LD_SH(tmp_buf + 13 * 32); + reg4 = LD_SH(tmp_buf + 19 * 32); + reg5 = LD_SH(tmp_buf + 21 * 32); + reg6 = LD_SH(tmp_buf + 27 * 32); + reg7 = LD_SH(tmp_buf + 29 * 32); + + DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + + /* 4 Stores */ + SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3); + DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2); + ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); + DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); + + /* 4 Stores */ + ADD4(reg0, reg3, reg1, reg2, 
reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3); + BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8); + DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); + + /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ + /* Load 8 & Store 8 */ + LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); + LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); + + SUB2(reg0, reg4, reg1, reg5, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg2, reg6, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); + + /* Load 8 & Store 8 */ + LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); + LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); + + SUB2(reg0, reg4, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg1, reg5, reg2, reg6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); +} + +static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, uint8_t *dst, + int32_t dst_stride) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + vec0 = LD_SH(tmp_odd_buf); + vec1 = LD_SH(tmp_odd_buf + 9 * 8); + vec2 = LD_SH(tmp_odd_buf + 14 * 8); + vec3 = LD_SH(tmp_odd_buf + 6 * 8); + loc0 = LD_SH(tmp_eve_buf); + loc1 = LD_SH(tmp_eve_buf + 8 * 
8); + loc2 = LD_SH(tmp_eve_buf + 4 * 8); + loc3 = LD_SH(tmp_eve_buf + 12 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); + SRARI_H4_SH(m0, m2, m4, m6, 6); + AOM_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0); + SRARI_H4_SH(m0, m2, m4, m6, 6); + AOM_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), m0, m2, m4, + m6); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 4 * 8); + vec1 = LD_SH(tmp_odd_buf + 13 * 8); + vec2 = LD_SH(tmp_odd_buf + 10 * 8); + vec3 = LD_SH(tmp_odd_buf + 3 * 8); + loc0 = LD_SH(tmp_eve_buf + 2 * 8); + loc1 = LD_SH(tmp_eve_buf + 10 * 8); + loc2 = LD_SH(tmp_eve_buf + 6 * 8); + loc3 = LD_SH(tmp_eve_buf + 14 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); + SRARI_H4_SH(m1, m3, m5, m7, 6); + AOM_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), m1, m3, m5, m7); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1); + SRARI_H4_SH(m1, m3, m5, m7, 6); + AOM_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), m1, m3, m5, + m7); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 2 * 8); + vec1 = LD_SH(tmp_odd_buf + 11 * 8); + vec2 = LD_SH(tmp_odd_buf + 12 * 8); + vec3 = LD_SH(tmp_odd_buf + 7 * 8); + loc0 = LD_SH(tmp_eve_buf + 1 * 8); + loc1 = LD_SH(tmp_eve_buf + 9 * 8); + loc2 = LD_SH(tmp_eve_buf + 5 * 8); + loc3 = LD_SH(tmp_eve_buf + 13 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); + SRARI_H4_SH(n0, n2, n4, n6, 6); + AOM_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), n0, n2, n4, n6); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0); + SRARI_H4_SH(n0, n2, n4, n6, 6); + AOM_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), n0, n2, n4, + n6); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 5 * 8); + vec1 = LD_SH(tmp_odd_buf + 15 * 8); + vec2 = LD_SH(tmp_odd_buf + 8 * 8); + 
vec3 = LD_SH(tmp_odd_buf + 1 * 8); + loc0 = LD_SH(tmp_eve_buf + 3 * 8); + loc1 = LD_SH(tmp_eve_buf + 11 * 8); + loc2 = LD_SH(tmp_eve_buf + 7 * 8); + loc3 = LD_SH(tmp_eve_buf + 15 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); + SRARI_H4_SH(n1, n3, n5, n7, 6); + AOM_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), n1, n3, n5, n7); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1); + SRARI_H4_SH(n1, n3, n5, n7, 6); + AOM_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), n1, n3, n5, + n7); +} + +static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) { + DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); + DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); + + idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); + idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); + idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst, + dst_stride); +} + +void aom_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); + int16_t *out_ptr = out_arr; + + /* transform rows */ + for (i = 0; i < 4; ++i) { + /* process 32 * 8 block */ + idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8))); + } + + /* transform columns */ + for (i = 0; i < 4; ++i) { + /* process 8 * 32 block */ + idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void aom_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); + int16_t *out_ptr = out_arr; + + for (i = 32; i--;) { + __asm__ __volatile__( + "sw $zero, 0(%[out_ptr]) \n\t" + "sw $zero, 4(%[out_ptr]) \n\t" + "sw $zero, 8(%[out_ptr]) \n\t" + "sw $zero, 12(%[out_ptr]) \n\t" + "sw $zero, 16(%[out_ptr]) \n\t" + "sw $zero, 20(%[out_ptr]) \n\t" + "sw $zero, 24(%[out_ptr]) \n\t" + "sw $zero, 
28(%[out_ptr]) \n\t" + "sw $zero, 32(%[out_ptr]) \n\t" + "sw $zero, 36(%[out_ptr]) \n\t" + "sw $zero, 40(%[out_ptr]) \n\t" + "sw $zero, 44(%[out_ptr]) \n\t" + "sw $zero, 48(%[out_ptr]) \n\t" + "sw $zero, 52(%[out_ptr]) \n\t" + "sw $zero, 56(%[out_ptr]) \n\t" + "sw $zero, 60(%[out_ptr]) \n\t" + + : + : [out_ptr] "r"(out_ptr)); + + out_ptr += 32; + } + + out_ptr = out_arr; + + /* rows: only upper-left 8x8 has non-zero coeff */ + idct32x8_1d_rows_msa(input, out_ptr); + + /* transform columns */ + for (i = 0; i < 4; ++i) { + /* process 8 * 32 block */ + idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void aom_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + int16_t out; + v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 6); + + vec = __msa_fill_h(out); + + for (i = 16; i--;) { + LD_UB2(dst, 16, dst0, dst1); + LD_UB2(dst + dst_stride, 16, dst2, dst3); + + UNPCK_UB_SH(dst0, res0, res4); + UNPCK_UB_SH(dst1, res1, res5); + UNPCK_UB_SH(dst2, res2, res6); + UNPCK_UB_SH(dst3, res3, res7); + ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); + ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); + CLIP_SH4_0_255(res0, res1, res2, res3); + CLIP_SH4_0_255(res4, res5, res6, res7); + PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1, + tmp2, tmp3); + + ST_UB2(tmp0, tmp1, dst, 16); + dst += dst_stride; + ST_UB2(tmp2, tmp3, dst, 16); + dst += dst_stride; + } +} diff --git a/third_party/aom/aom_dsp/mips/idct4x4_msa.c b/third_party/aom/aom_dsp/mips/idct4x4_msa.c new file mode 100644 index 000000000..274818baa --- /dev/null +++ b/third_party/aom/aom_dsp/mips/idct4x4_msa.c @@ -0,0 +1,99 @@ +/* + * 
Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/inv_txfm_msa.h" + +void aom_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3; + v4i32 in0_r, in1_r, in2_r, in3_r, in4_r; + + /* load vector elements of 4x4 block */ + LD4x4_SH(input, in0, in2, in3, in1); + TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1); + UNPCK_R_SH_SW(in0, in0_r); + UNPCK_R_SH_SW(in2, in2_r); + UNPCK_R_SH_SW(in3, in3_r); + UNPCK_R_SH_SW(in1, in1_r); + SRA_4V(in0_r, in1_r, in2_r, in3_r, UNIT_QUANT_SHIFT); + + in0_r += in2_r; + in3_r -= in1_r; + in4_r = (in0_r - in3_r) >> 1; + in1_r = in4_r - in1_r; + in2_r = in4_r - in2_r; + in0_r -= in1_r; + in3_r += in2_r; + + TRANSPOSE4x4_SW_SW(in0_r, in1_r, in2_r, in3_r, in0_r, in1_r, in2_r, in3_r); + + in0_r += in1_r; + in2_r -= in3_r; + in4_r = (in0_r - in2_r) >> 1; + in3_r = in4_r - in3_r; + in1_r = in4_r - in1_r; + in0_r -= in3_r; + in2_r += in1_r; + + PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, in0, in1, + in2, in3); + ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride); +} + +void aom_iwht4x4_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int16_t a1, e1; + v8i16 in1, in0 = { 0 }; + + a1 = input[0] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + + in0 = __msa_insert_h(in0, 0, a1); + in0 = __msa_insert_h(in0, 1, e1); + in0 = __msa_insert_h(in0, 2, e1); + in0 = __msa_insert_h(in0, 3, e1); + + in1 = in0 >> 1; + in0 -= in1; + + 
ADDBLK_ST4x4_UB(in0, in1, in1, in1, dst, dst_stride); +} + +void aom_idct4x4_16_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3; + + /* load vector elements of 4x4 block */ + LD4x4_SH(input, in0, in1, in2, in3); + /* rows */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* columns */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* rounding (add 2^3, divide by 2^4) */ + SRARI_H4_SH(in0, in1, in2, in3, 4); + ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); +} + +void aom_idct4x4_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int16_t out; + v8i16 vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 4); + vec = __msa_fill_h(out); + + ADDBLK_ST4x4_UB(vec, vec, vec, vec, dst, dst_stride); +} diff --git a/third_party/aom/aom_dsp/mips/idct8x8_msa.c b/third_party/aom/aom_dsp/mips/idct8x8_msa.c new file mode 100644 index 000000000..981c103cd --- /dev/null +++ b/third_party/aom/aom_dsp/mips/idct8x8_msa.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/mips/inv_txfm_msa.h" + +void aom_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + /* load vector elements of 8x8 block */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + + /* rows transform */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + /* 1D idct8x8 */ + AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + /* columns transform */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + /* 1D idct8x8 */ + AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + /* final rounding (add 2^4, divide by 2^5) and shift */ + SRARI_H4_SH(in0, in1, in2, in3, 5); + SRARI_H4_SH(in4, in5, in6, in7, 5); + /* add block and store 8x8 */ + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); + dst += (4 * dst_stride); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); +} + +void aom_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3; + v4i32 tmp0, tmp1, tmp2, tmp3; + v8i16 zero = { 0 }; + + /* load vector elements of 8x8 block */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + + /* stage1 */ + ILVL_H2_SH(in3, in0, in2, in1, s0, s1); + k0 = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); + k1 = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); + k2 = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); + k3 = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); + DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3); + SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS); + PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1); + PCKEV_H2_SH(zero, 
tmp2, zero, tmp3, s2, s3); + BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5); + + /* stage2 */ + ILVR_H2_SH(in3, in1, in2, in0, s1, s0); + k0 = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); + k1 = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); + k2 = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); + k3 = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); + DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3); + SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS); + PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1); + PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3); + BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3); + + /* stage3 */ + s0 = __msa_ilvr_h(s6, s5); + + k1 = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); + DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1); + SRARI_W2_SW(tmp0, tmp1, DCT_CONST_BITS); + PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3); + + /* stage4 */ + BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, in0, in1, in2, in3, in4, in5, in6, + in7); + TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + /* final rounding (add 2^4, divide by 2^5) and shift */ + SRARI_H4_SH(in0, in1, in2, in3, 5); + SRARI_H4_SH(in4, in5, in6, in7, 5); + + /* add block and store 8x8 */ + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); + dst += (4 * dst_stride); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); +} + +void aom_idct8x8_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int16_t out; + int32_t val; + v8i16 vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + val = ROUND_POWER_OF_TWO(out, 5); + vec = __msa_fill_h(val); + + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); + dst += (4 * dst_stride); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); +} diff --git 
a/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c
new file mode 100644
index 000000000..dc8f20208
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+/* Asm template fragments shared by the 16-wide predictors below.  Each one
+ * expands to adjacent string literals, so the assembler still sees the fully
+ * unrolled instruction stream. */
+
+/* Store the 32-bit register `reg` four times, filling one 16-pixel row. */
+#define PRED16_STORE_ROW(reg)                        \
+  "sw         %[" #reg "],   (%[dst])          \n\t" \
+  "sw         %[" #reg "],   4(%[dst])         \n\t" \
+  "sw         %[" #reg "],   8(%[dst])         \n\t" \
+  "sw         %[" #reg "],   12(%[dst])        \n\t"
+
+/* Step dst to the next row, then fill it.  addu is used because this is
+ * address arithmetic: unlike add it cannot raise an overflow exception. */
+#define PRED16_NEXT_ROW(reg)                         \
+  "addu       %[dst],        %[dst], %[stride] \n\t" \
+  PRED16_STORE_ROW(reg)
+
+/* Widen the four bytes of word `a` and of word `l` into halfword pairs. */
+#define DC16_WIDEN(a, l)                               \
+  "preceu.ph.qbl  %[above_l1],  %[" #a "]        \n\t" \
+  "preceu.ph.qbr  %[above_r1],  %[" #a "]        \n\t" \
+  "preceu.ph.qbl  %[left_l1],   %[" #l "]        \n\t" \
+  "preceu.ph.qbr  %[left_r1],   %[" #l "]        \n\t"
+
+/* Add the four widened halfword pairs into the running two-lane sum. */
+#define DC16_ACCUMULATE                                       \
+  "addu.ph        %[average],   %[average],  %[above_l1] \n\t" \
+  "addu.ph        %[average],   %[average],  %[above_r1] \n\t" \
+  "addu.ph        %[average],   %[average],  %[left_l1]  \n\t" \
+  "addu.ph        %[average],   %[average],  %[left_r1]  \n\t"
+
+/* Horizontal 16x16 intra predictor: row r of dst is filled with left[r].
+ *
+ * dst    : top-left pixel of the 16x16 block; rows are written with word
+ *          stores (sw).  NOTE(review): this presumably requires every row
+ *          to be 4-byte aligned - confirm against the callers.
+ * stride : distance in bytes between consecutive rows of dst.
+ * above  : unused by this predictor.
+ * left   : 16 pixels of the column to the left of the block.
+ */
+void aom_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+  int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+
+  (void)above;
+
+  __asm__ __volatile__(
+      /* lb sign-extends, but replv.qb below only replicates the low byte */
+      "lb         %[tmp1],      (%[left])                    \n\t"
+      "lb         %[tmp2],      1(%[left])                   \n\t"
+      "lb         %[tmp3],      2(%[left])                   \n\t"
+      "lb         %[tmp4],      3(%[left])                   \n\t"
+      "lb         %[tmp5],      4(%[left])                   \n\t"
+      "lb         %[tmp6],      5(%[left])                   \n\t"
+      "lb         %[tmp7],      6(%[left])                   \n\t"
+      "lb         %[tmp8],      7(%[left])                   \n\t"
+      "lb         %[tmp9],      8(%[left])                   \n\t"
+      "lb         %[tmp10],     9(%[left])                   \n\t"
+      "lb         %[tmp11],     10(%[left])                  \n\t"
+      "lb         %[tmp12],     11(%[left])                  \n\t"
+      "lb         %[tmp13],     12(%[left])                  \n\t"
+      "lb         %[tmp14],     13(%[left])                  \n\t"
+      "lb         %[tmp15],     14(%[left])                  \n\t"
+      "lb         %[tmp16],     15(%[left])                  \n\t"
+
+      "replv.qb   %[tmp1],      %[tmp1]                      \n\t"
+      "replv.qb   %[tmp2],      %[tmp2]                      \n\t"
+      "replv.qb   %[tmp3],      %[tmp3]                      \n\t"
+      "replv.qb   %[tmp4],      %[tmp4]                      \n\t"
+      "replv.qb   %[tmp5],      %[tmp5]                      \n\t"
+      "replv.qb   %[tmp6],      %[tmp6]                      \n\t"
+      "replv.qb   %[tmp7],      %[tmp7]                      \n\t"
+      "replv.qb   %[tmp8],      %[tmp8]                      \n\t"
+      "replv.qb   %[tmp9],      %[tmp9]                      \n\t"
+      "replv.qb   %[tmp10],     %[tmp10]                     \n\t"
+      "replv.qb   %[tmp11],     %[tmp11]                     \n\t"
+      "replv.qb   %[tmp12],     %[tmp12]                     \n\t"
+      "replv.qb   %[tmp13],     %[tmp13]                     \n\t"
+      "replv.qb   %[tmp14],     %[tmp14]                     \n\t"
+      "replv.qb   %[tmp15],     %[tmp15]                     \n\t"
+      "replv.qb   %[tmp16],     %[tmp16]                     \n\t"
+
+      PRED16_STORE_ROW(tmp1)
+      PRED16_NEXT_ROW(tmp2)
+      PRED16_NEXT_ROW(tmp3)
+      PRED16_NEXT_ROW(tmp4)
+      PRED16_NEXT_ROW(tmp5)
+      PRED16_NEXT_ROW(tmp6)
+      PRED16_NEXT_ROW(tmp7)
+      PRED16_NEXT_ROW(tmp8)
+      PRED16_NEXT_ROW(tmp9)
+      PRED16_NEXT_ROW(tmp10)
+      PRED16_NEXT_ROW(tmp11)
+      PRED16_NEXT_ROW(tmp12)
+      PRED16_NEXT_ROW(tmp13)
+      PRED16_NEXT_ROW(tmp14)
+      PRED16_NEXT_ROW(tmp15)
+      PRED16_NEXT_ROW(tmp16)
+
+      /* dst is advanced inside the asm, so it must be a read-write operand;
+       * declaring it input-only lets the compiler assume it is unchanged. */
+      : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
+        [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
+        [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [tmp9] "=&r"(tmp9),
+        [tmp10] "=&r"(tmp10), [tmp11] "=&r"(tmp11), [tmp12] "=&r"(tmp12),
+        [tmp13] "=&r"(tmp13), [tmp14] "=&r"(tmp14), [tmp15] "=&r"(tmp15),
+        [tmp16] "=&r"(tmp16), [dst] "+&r"(dst)
+      : [left] "r"(left), [stride] "r"(stride)
+      /* the stores go through dst, which is not listed as a memory operand */
+      : "memory");
+}
+
+/* DC 16x16 intra predictor: fills the block with the rounded mean of the 16
+ * `above` and 16 `left` pixels, (sum + 16) >> 5.
+ *
+ * dst    : top-left pixel of the 16x16 block, written with word stores.
+ * stride : distance in bytes between consecutive rows of dst.
+ * above  : 16 pixels of the row above the block.
+ * left   : 16 pixels of the column to the left of the block.
+ *
+ * NOTE(review): above and left are read with lw and dst is written with sw,
+ * which presumably requires all three to be 4-byte aligned - confirm.
+ */
+void aom_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  int32_t expected_dc;
+  int32_t average;
+  int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
+  int32_t above2, left2;
+
+  __asm__ __volatile__(
+      "lw             %[above1],    (%[above])               \n\t"
+      "lw             %[above2],    4(%[above])              \n\t"
+      "lw             %[left1],     (%[left])                \n\t"
+      "lw             %[left2],     4(%[left])               \n\t"
+
+      DC16_WIDEN(above1, left1)
+
+      /* the first group initialises the two-lane running sum */
+      "addu.ph        %[average],   %[above_r1], %[above_l1] \n\t"
+      "addu.ph        %[average],   %[average],  %[left_l1]  \n\t"
+      "addu.ph        %[average],   %[average],  %[left_r1]  \n\t"
+
+      DC16_WIDEN(above2, left2)
+      DC16_ACCUMULATE
+
+      "lw             %[above1],    8(%[above])              \n\t"
+      "lw             %[above2],    12(%[above])             \n\t"
+      "lw             %[left1],     8(%[left])               \n\t"
+      "lw             %[left2],     12(%[left])              \n\t"
+
+      DC16_WIDEN(above1, left1)
+      DC16_ACCUMULATE
+
+      DC16_WIDEN(above2, left2)
+      DC16_ACCUMULATE
+
+      /* Add the rounding term, fold the upper lane into the lower one and
+       * divide by 32.  The stale upper lane is harmless: replv.qb only
+       * replicates the low byte. */
+      "addiu          %[average],   %[average],  16          \n\t"
+      "srl            %[tmp],       %[average],  16          \n\t"
+      "addu.ph        %[average],   %[tmp],      %[average]  \n\t"
+      "srl            %[expected_dc], %[average], 5          \n\t"
+      "replv.qb       %[expected_dc], %[expected_dc]         \n\t"
+
+      PRED16_STORE_ROW(expected_dc)
+      PRED16_NEXT_ROW(expected_dc)
+      PRED16_NEXT_ROW(expected_dc)
+      PRED16_NEXT_ROW(expected_dc)
+      PRED16_NEXT_ROW(expected_dc)
+      PRED16_NEXT_ROW(expected_dc)
+      PRED16_NEXT_ROW(expected_dc)
+      PRED16_NEXT_ROW(expected_dc)
+      PRED16_NEXT_ROW(expected_dc)
+      PRED16_NEXT_ROW(expected_dc)
+      PRED16_NEXT_ROW(expected_dc)
+      PRED16_NEXT_ROW(expected_dc)
+      PRED16_NEXT_ROW(expected_dc)
+      PRED16_NEXT_ROW(expected_dc)
+      PRED16_NEXT_ROW(expected_dc)
+      PRED16_NEXT_ROW(expected_dc)
+
+      /* dst is advanced inside the asm, so it is a read-write operand */
+      : [left1] "=&r"(left1), [above1] "=&r"(above1), [left_l1] "=&r"(left_l1),
+        [above_l1] "=&r"(above_l1), [left_r1] "=&r"(left_r1),
+        [above_r1] "=&r"(above_r1), [above2] "=&r"(above2),
+        [left2] "=&r"(left2), [average] "=&r"(average), [tmp] "=&r"(tmp),
+        [expected_dc] "=&r"(expected_dc), [dst] "+&r"(dst)
+      : [above] "r"(above), [left] "r"(left), [stride] "r"(stride)
+      /* the stores go through dst, which is not listed as a memory operand */
+      : "memory");
+}
+
+#undef DC16_ACCUMULATE
+#undef DC16_WIDEN
+#undef PRED16_NEXT_ROW
+#undef PRED16_STORE_ROW
+#endif  // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c
new file mode 100644
index 000000000..ea7c02810
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+/* Asm template fragments for the 4x4 predictors below.  Each one expands to
+ * adjacent string literals, so the assembler still sees the fully unrolled
+ * instruction stream. */
+
+/* Step dst to the next row.  addu is used because this is address
+ * arithmetic: unlike add it cannot raise an overflow exception. */
+#define PRED4_NEXT_ROW                                   \
+  "addu       %[dst],       %[dst],     %[stride]  \n\t"
+
+/* resl/resr = above + leftn - top_left, as two signed halfword lanes each. */
+#define TM4_SUMS(leftn)                                          \
+  "addu.ph         %[resl],      %[abovel],     %[" #leftn "] \n\t" \
+  "subu.ph         %[resl],      %[resl],       %[top_left]   \n\t" \
+  "addu.ph         %[resr],      %[abover],     %[" #leftn "] \n\t" \
+  "subu.ph         %[resr],      %[resr],       %[top_left]   \n\t"
+
+/* Clamp the four sums in resr/resl through the cm lookup table.  Pixels 0
+ * and 1 are stored right away; pixels 2 and 3 are left in res0/res1 so the
+ * sums for the next row can be issued before TM4_FINISH_ROW stores them. */
+#define TM4_CLAMP_ROW                                            \
+  "sll             %[res0],      %[resr],       16            \n\t" \
+  "sra             %[res0],      %[res0],       16            \n\t" \
+  "lbux            %[res0],      %[res0](%[cm])               \n\t" \
+  "sra             %[res1],      %[resr],       16            \n\t" \
+  "lbux            %[res1],      %[res1](%[cm])               \n\t" \
+  "sb              %[res0],      (%[dst])                     \n\t" \
+  "sll             %[res0],      %[resl],       16            \n\t" \
+  "sra             %[res0],      %[res0],       16            \n\t" \
+  "lbux            %[res0],      %[res0](%[cm])               \n\t" \
+  "sb              %[res1],      1(%[dst])                    \n\t" \
+  "sra             %[res1],      %[resl],       16            \n\t" \
+  "lbux            %[res1],      %[res1](%[cm])               \n\t"
+
+/* Store pixels 2 and 3 of the current row. */
+#define TM4_FINISH_ROW                                           \
+  "sb              %[res0],      2(%[dst])                    \n\t" \
+  "sb              %[res1],      3(%[dst])                    \n\t"
+
+/* Horizontal 4x4 intra predictor: row r of dst is filled with left[r].
+ *
+ * dst    : top-left pixel of the 4x4 block; each row is one word store.
+ *          NOTE(review): presumably every row must be 4-byte aligned.
+ * stride : distance in bytes between consecutive rows of dst.
+ * above  : unused by this predictor.
+ * left   : 4 pixels of the column to the left of the block.
+ */
+void aom_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  int32_t tmp1, tmp2, tmp3, tmp4;
+
+  (void)above;
+
+  __asm__ __volatile__(
+      /* lb sign-extends, but replv.qb below only replicates the low byte */
+      "lb         %[tmp1],      (%[left])                    \n\t"
+      "lb         %[tmp2],      1(%[left])                   \n\t"
+      "lb         %[tmp3],      2(%[left])                   \n\t"
+      "lb         %[tmp4],      3(%[left])                   \n\t"
+      "replv.qb   %[tmp1],      %[tmp1]                      \n\t"
+      "replv.qb   %[tmp2],      %[tmp2]                      \n\t"
+      "replv.qb   %[tmp3],      %[tmp3]                      \n\t"
+      "replv.qb   %[tmp4],      %[tmp4]                      \n\t"
+      "sw         %[tmp1],      (%[dst])                     \n\t"
+      PRED4_NEXT_ROW
+      "sw         %[tmp2],      (%[dst])                     \n\t"
+      PRED4_NEXT_ROW
+      "sw         %[tmp3],      (%[dst])                     \n\t"
+      PRED4_NEXT_ROW
+      "sw         %[tmp4],      (%[dst])                     \n\t"
+
+      /* dst is advanced inside the asm, so it must be a read-write operand;
+       * declaring it input-only lets the compiler assume it is unchanged. */
+      : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
+        [tmp4] "=&r"(tmp4), [dst] "+&r"(dst)
+      : [left] "r"(left), [stride] "r"(stride)
+      /* the stores go through dst, which is not listed as a memory operand */
+      : "memory");
+}
+
+/* DC 4x4 intra predictor: fills the block with the rounded mean of the 4
+ * `above` and 4 `left` pixels, (sum + 4) >> 3.
+ *
+ * dst    : top-left pixel of the 4x4 block, written with word stores.
+ * stride : distance in bytes between consecutive rows of dst.
+ * above  : 4 pixels of the row above the block.
+ * left   : 4 pixels of the column to the left of the block.
+ *
+ * NOTE(review): above and left are read with lw and dst is written with sw,
+ * which presumably requires all three to be 4-byte aligned - confirm.
+ */
+void aom_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int32_t expected_dc;
+  int32_t average;
+  int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l;
+
+  __asm__ __volatile__(
+      "lw              %[above_c],         (%[above])                    \n\t"
+      "lw              %[left_c],          (%[left])                     \n\t"
+
+      /* widen the packed bytes into halfword pairs before summing */
+      "preceu.ph.qbl   %[above_l],         %[above_c]                    \n\t"
+      "preceu.ph.qbr   %[above_r],         %[above_c]                    \n\t"
+      "preceu.ph.qbl   %[left_l],          %[left_c]                     \n\t"
+      "preceu.ph.qbr   %[left_r],          %[left_c]                     \n\t"
+
+      "addu.ph         %[average],         %[above_r],       %[above_l]  \n\t"
+      "addu.ph         %[average],         %[average],       %[left_l]   \n\t"
+      "addu.ph         %[average],         %[average],       %[left_r]   \n\t"
+      /* Add the rounding term, fold the upper lane into the lower one and
+       * divide by 8.  The stale upper lane is harmless: replv.qb only
+       * replicates the low byte. */
+      "addiu           %[average],         %[average],       4           \n\t"
+      "srl             %[tmp],             %[average],       16          \n\t"
+      "addu.ph         %[average],         %[tmp],           %[average]  \n\t"
+      "srl             %[expected_dc],     %[average],       3           \n\t"
+      "replv.qb        %[expected_dc],     %[expected_dc]                \n\t"
+
+      "sw              %[expected_dc],     (%[dst])                      \n\t"
+      PRED4_NEXT_ROW
+      "sw              %[expected_dc],     (%[dst])                      \n\t"
+      PRED4_NEXT_ROW
+      "sw              %[expected_dc],     (%[dst])                      \n\t"
+      PRED4_NEXT_ROW
+      "sw              %[expected_dc],     (%[dst])                      \n\t"
+
+      /* dst is advanced inside the asm, so it is a read-write operand */
+      : [above_c] "=&r"(above_c), [above_l] "=&r"(above_l),
+        [above_r] "=&r"(above_r), [left_c] "=&r"(left_c),
+        [left_l] "=&r"(left_l), [left_r] "=&r"(left_r),
+        [average] "=&r"(average), [tmp] "=&r"(tmp),
+        [expected_dc] "=&r"(expected_dc), [dst] "+&r"(dst)
+      : [above] "r"(above), [left] "r"(left), [stride] "r"(stride)
+      /* the stores go through dst, which is not listed as a memory operand */
+      : "memory");
+}
+
+/* TM 4x4 intra predictor:
+ * dst[r][c] = clamp(above[c] + left[r] - above[-1]).
+ *
+ * dst    : top-left pixel of the 4x4 block, written one byte at a time.
+ * stride : distance in bytes between consecutive rows of dst.
+ * above  : 4 pixels of the row above the block; above[-1] is also read.
+ * left   : 4 pixels of the column to the left of the block.
+ *
+ * The clamp is a table lookup through aom_ff_cropTbl with a signed index.
+ * NOTE(review): this presumably relies on the table being addressable at
+ * negative and >255 offsets, and the lane-to-pixel mapping looks like it
+ * assumes little-endian byte order - confirm both.
+ */
+void aom_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int32_t abovel, abover;
+  int32_t left0, left1, left2, left3;
+  int32_t res0, res1;
+  int32_t resl;
+  int32_t resr;
+  int32_t top_left;
+  uint8_t *cm = aom_ff_cropTbl;
+
+  __asm__ __volatile__(
+      /* resl doubles as scratch for the packed above[0..3] word */
+      "ulw             %[resl],      (%[above])                         \n\t"
+
+      "lbu             %[left0],     (%[left])                          \n\t"
+      "lbu             %[left1],     1(%[left])                         \n\t"
+      "lbu             %[left2],     2(%[left])                         \n\t"
+      "lbu             %[left3],     3(%[left])                         \n\t"
+
+      "lbu             %[top_left],  -1(%[above])                       \n\t"
+
+      "preceu.ph.qbl   %[abovel],    %[resl]                            \n\t"
+      "preceu.ph.qbr   %[abover],    %[resl]                            \n\t"
+
+      "replv.ph        %[left0],     %[left0]                           \n\t"
+      "replv.ph        %[left1],     %[left1]                           \n\t"
+      "replv.ph        %[left2],     %[left2]                           \n\t"
+      "replv.ph        %[left3],     %[left3]                           \n\t"
+
+      "replv.ph        %[top_left],  %[top_left]                        \n\t"
+
+      /* row 0 */
+      TM4_SUMS(left0)
+      TM4_CLAMP_ROW
+      TM4_SUMS(left1)
+      TM4_FINISH_ROW
+      PRED4_NEXT_ROW
+
+      /* row 1 */
+      TM4_CLAMP_ROW
+      TM4_SUMS(left2)
+      TM4_FINISH_ROW
+      PRED4_NEXT_ROW
+
+      /* row 2 */
+      TM4_CLAMP_ROW
+      TM4_SUMS(left3)
+      TM4_FINISH_ROW
+      PRED4_NEXT_ROW
+
+      /* row 3 */
+      TM4_CLAMP_ROW
+      TM4_FINISH_ROW
+
+      /* dst is advanced inside the asm, so it is a read-write operand */
+      : [abovel] "=&r"(abovel), [abover] "=&r"(abover), [left0] "=&r"(left0),
+        [left1] "=&r"(left1), [left2] "=&r"(left2), [res0] "=&r"(res0),
+        [res1] "=&r"(res1), [left3] "=&r"(left3), [resl] "=&r"(resl),
+        [resr] "=&r"(resr), [top_left] "=&r"(top_left), [dst] "+&r"(dst)
+      : [above] "r"(above), [left] "r"(left), [stride] "r"(stride),
+        [cm] "r"(cm)
+      /* the stores go through dst, which is not listed as a memory operand */
+      : "memory");
+}
+
+#undef TM4_FINISH_ROW
+#undef TM4_CLAMP_ROW
+#undef TM4_SUMS
+#undef PRED4_NEXT_ROW
+#endif  // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c
new file mode 100644
index 000000000..1114fbc00
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c
@@ -0,0 +1,603 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+/* Asm template fragments for the 8x8 predictors below.  Each one expands to
+ * adjacent string literals, so the assembler still sees the fully unrolled
+ * instruction stream. */
+
+/* Store the 32-bit register `reg` twice, filling one 8-pixel row. */
+#define PRED8_STORE_ROW(reg)                         \
+  "sw         %[" #reg "],   (%[dst])          \n\t" \
+  "sw         %[" #reg "],   4(%[dst])         \n\t"
+
+/* Step dst to the next row.  addu is used because this is address
+ * arithmetic: unlike add it cannot raise an overflow exception. */
+#define PRED8_ADVANCE                                \
+  "addu       %[dst],        %[dst], %[stride] \n\t"
+
+/* Step dst to the next row, then fill it. */
+#define PRED8_NEXT_ROW(reg) \
+  PRED8_ADVANCE             \
+  PRED8_STORE_ROW(reg)
+
+/* Compute above + left0 - top_left for four pixels.  `hi` holds the upper
+ * two widened above pixels (results in res2/res3) and `lo` the lower two
+ * (results in res0/res1); each result is a sign-extended 16-bit value. */
+#define TM8_SUM4(hi, lo)                                            \
+  "addu.ph         %[reshw],     %[" #hi "],    %[left0]      \n\t" \
+  "subu.ph         %[reshw],     %[reshw],      %[top_left]   \n\t" \
+  "sll             %[res2],      %[reshw],      16            \n\t" \
+  "sra             %[res2],      %[res2],       16            \n\t" \
+  "sra             %[res3],      %[reshw],      16            \n\t" \
+  "addu.ph         %[reshw],     %[" #lo "],    %[left0]      \n\t" \
+  "subu.ph         %[reshw],     %[reshw],      %[top_left]   \n\t" \
+  "sll             %[res0],      %[reshw],      16            \n\t" \
+  "sra             %[res0],      %[res0],       16            \n\t" \
+  "sra             %[res1],      %[reshw],      16            \n\t"
+
+/* Clamp res0..res3 through the cm lookup table. */
+#define TM8_CLAMP4                                                  \
+  "lbux            %[res0],      %[res0](%[cm])               \n\t" \
+  "lbux            %[res1],      %[res1](%[cm])               \n\t" \
+  "lbux            %[res2],      %[res2](%[cm])               \n\t" \
+  "lbux            %[res3],      %[res3](%[cm])               \n\t"
+
+/* Store res0..res3 at byte offsets o0..o3 of the current row. */
+#define TM8_STORE4(o0, o1, o2, o3)                                  \
+  "sb              %[res0],      " #o0 "(%[dst])              \n\t" \
+  "sb              %[res1],      " #o1 "(%[dst])              \n\t" \
+  "sb              %[res2],      " #o2 "(%[dst])              \n\t" \
+  "sb              %[res3],      " #o3 "(%[dst])              \n\t"
+
+/* One full row, followed by the setup for the next one.  The load of
+ * left[next] is issued between the second sum and its table lookups,
+ * exactly where the hand-unrolled code placed it. */
+#define TM8_ROW(next)                                               \
+  TM8_SUM4(abovel, abover)                                          \
+  TM8_CLAMP4                                                        \
+  TM8_STORE4(0, 1, 2, 3)                                            \
+  TM8_SUM4(abovel_1, abover_1)                                      \
+  "lbu             %[left0],     " #next "(%[left])           \n\t" \
+  TM8_CLAMP4                                                        \
+  TM8_STORE4(4, 5, 6, 7)                                            \
+  "replv.ph        %[left0],     %[left0]                     \n\t" \
+  PRED8_ADVANCE
+
+/* The final row: same arithmetic, but nothing left to prefetch or advance. */
+#define TM8_LAST_ROW            \
+  TM8_SUM4(abovel, abover)      \
+  TM8_CLAMP4                    \
+  TM8_STORE4(0, 1, 2, 3)        \
+  TM8_SUM4(abovel_1, abover_1)  \
+  TM8_CLAMP4                    \
+  TM8_STORE4(4, 5, 6, 7)
+
+/* Horizontal 8x8 intra predictor: row r of dst is filled with left[r].
+ *
+ * dst    : top-left pixel of the 8x8 block; rows are written with word
+ *          stores (sw).  NOTE(review): this presumably requires every row
+ *          to be 4-byte aligned - confirm against the callers.
+ * stride : distance in bytes between consecutive rows of dst.
+ * above  : unused by this predictor.
+ * left   : 8 pixels of the column to the left of the block.
+ */
+void aom_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+
+  (void)above;
+
+  __asm__ __volatile__(
+      /* lb sign-extends, but replv.qb below only replicates the low byte */
+      "lb         %[tmp1],      (%[left])                   \n\t"
+      "lb         %[tmp2],      1(%[left])                  \n\t"
+      "lb         %[tmp3],      2(%[left])                  \n\t"
+      "lb         %[tmp4],      3(%[left])                  \n\t"
+      "lb         %[tmp5],      4(%[left])                  \n\t"
+      "lb         %[tmp6],      5(%[left])                  \n\t"
+      "lb         %[tmp7],      6(%[left])                  \n\t"
+      "lb         %[tmp8],      7(%[left])                  \n\t"
+
+      "replv.qb   %[tmp1],      %[tmp1]                     \n\t"
+      "replv.qb   %[tmp2],      %[tmp2]                     \n\t"
+      "replv.qb   %[tmp3],      %[tmp3]                     \n\t"
+      "replv.qb   %[tmp4],      %[tmp4]                     \n\t"
+      "replv.qb   %[tmp5],      %[tmp5]                     \n\t"
+      "replv.qb   %[tmp6],      %[tmp6]                     \n\t"
+      "replv.qb   %[tmp7],      %[tmp7]                     \n\t"
+      "replv.qb   %[tmp8],      %[tmp8]                     \n\t"
+
+      PRED8_STORE_ROW(tmp1)
+      PRED8_NEXT_ROW(tmp2)
+      PRED8_NEXT_ROW(tmp3)
+      PRED8_NEXT_ROW(tmp4)
+      PRED8_NEXT_ROW(tmp5)
+      PRED8_NEXT_ROW(tmp6)
+      PRED8_NEXT_ROW(tmp7)
+      PRED8_NEXT_ROW(tmp8)
+
+      /* dst is advanced inside the asm, so it must be a read-write operand;
+       * declaring it input-only lets the compiler assume it is unchanged. */
+      : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
+        [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
+        [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [dst] "+&r"(dst)
+      : [left] "r"(left), [stride] "r"(stride)
+      /* the stores go through dst, which is not listed as a memory operand */
+      : "memory");
+}
+
+/* DC 8x8 intra predictor: fills the block with the rounded mean of the 8
+ * `above` and 8 `left` pixels, (sum + 8) >> 4.
+ *
+ * dst    : top-left pixel of the 8x8 block, written with word stores.
+ * stride : distance in bytes between consecutive rows of dst.
+ * above  : 8 pixels of the row above the block.
+ * left   : 8 pixels of the column to the left of the block.
+ *
+ * NOTE(review): above and left are read with lw and dst is written with sw,
+ * which presumably requires all three to be 4-byte aligned - confirm.
+ */
+void aom_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int32_t expected_dc;
+  int32_t average;
+  int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
+  int32_t above2, above_l2, above_r2, left2, left_r2, left_l2;
+
+  __asm__ __volatile__(
+      "lw              %[above1],         (%[above])                      \n\t"
+      "lw              %[above2],         4(%[above])                     \n\t"
+      "lw              %[left1],          (%[left])                       \n\t"
+      "lw              %[left2],          4(%[left])                      \n\t"
+
+      /* widen the packed bytes into halfword pairs before summing */
+      "preceu.ph.qbl   %[above_l1],       %[above1]                       \n\t"
+      "preceu.ph.qbr   %[above_r1],       %[above1]                       \n\t"
+      "preceu.ph.qbl   %[left_l1],        %[left1]                        \n\t"
+      "preceu.ph.qbr   %[left_r1],        %[left1]                        \n\t"
+
+      "preceu.ph.qbl   %[above_l2],       %[above2]                       \n\t"
+      "preceu.ph.qbr   %[above_r2],       %[above2]                       \n\t"
+      "preceu.ph.qbl   %[left_l2],        %[left2]                        \n\t"
+      "preceu.ph.qbr   %[left_r2],        %[left2]                        \n\t"
+
+      "addu.ph         %[average],        %[above_r1],      %[above_l1]   \n\t"
+      "addu.ph         %[average],        %[average],       %[left_l1]    \n\t"
+      "addu.ph         %[average],        %[average],       %[left_r1]    \n\t"
+
+      "addu.ph         %[average],        %[average],       %[above_l2]   \n\t"
+      "addu.ph         %[average],        %[average],       %[above_r2]   \n\t"
+      "addu.ph         %[average],        %[average],       %[left_l2]    \n\t"
+      "addu.ph         %[average],        %[average],       %[left_r2]    \n\t"
+
+      /* Add the rounding term, fold the upper lane into the lower one and
+       * divide by 16.  The stale upper lane is harmless: replv.qb only
+       * replicates the low byte. */
+      "addiu           %[average],        %[average],       8             \n\t"
+
+      "srl             %[tmp],            %[average],       16            \n\t"
+      "addu.ph         %[average],        %[tmp],           %[average]    \n\t"
+      "srl             %[expected_dc],    %[average],       4             \n\t"
+      "replv.qb        %[expected_dc],    %[expected_dc]                  \n\t"
+
+      PRED8_STORE_ROW(expected_dc)
+      PRED8_NEXT_ROW(expected_dc)
+      PRED8_NEXT_ROW(expected_dc)
+      PRED8_NEXT_ROW(expected_dc)
+      PRED8_NEXT_ROW(expected_dc)
+      PRED8_NEXT_ROW(expected_dc)
+      PRED8_NEXT_ROW(expected_dc)
+      PRED8_NEXT_ROW(expected_dc)
+
+      /* dst is advanced inside the asm, so it is a read-write operand */
+      : [above1] "=&r"(above1), [above_l1] "=&r"(above_l1),
+        [above_r1] "=&r"(above_r1), [left1] "=&r"(left1),
+        [left_l1] "=&r"(left_l1), [left_r1] "=&r"(left_r1),
+        [above2] "=&r"(above2), [above_l2] "=&r"(above_l2),
+        [above_r2] "=&r"(above_r2), [left2] "=&r"(left2),
+        [left_l2] "=&r"(left_l2), [left_r2] "=&r"(left_r2),
+        [average] "=&r"(average), [tmp] "=&r"(tmp),
+        [expected_dc] "=&r"(expected_dc), [dst] "+&r"(dst)
+      : [above] "r"(above), [left] "r"(left), [stride] "r"(stride)
+      /* the stores go through dst, which is not listed as a memory operand */
+      : "memory");
+}
+
+/* TM 8x8 intra predictor:
+ * dst[r][c] = clamp(above[c] + left[r] - above[-1]).
+ *
+ * dst    : top-left pixel of the 8x8 block, written one byte at a time.
+ * stride : distance in bytes between consecutive rows of dst.
+ * above  : 8 pixels of the row above the block; above[-1] is also read.
+ * left   : 8 pixels of the column to the left of the block.
+ *
+ * The clamp is a table lookup through aom_ff_cropTbl with a signed index.
+ * NOTE(review): this presumably relies on the table being addressable at
+ * negative and >255 offsets, and the lane-to-pixel mapping looks like it
+ * assumes little-endian byte order - confirm both.
+ */
+void aom_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int32_t abovel, abover;
+  int32_t abovel_1, abover_1;
+  int32_t left0;
+  int32_t res0, res1, res2, res3;
+  int32_t reshw;
+  int32_t top_left;
+  uint8_t *cm = aom_ff_cropTbl;
+
+  __asm__ __volatile__(
+      /* reshw and top_left double as scratch for the packed above[] words */
+      "ulw             %[reshw],       (%[above])                         \n\t"
+      "ulw             %[top_left],    4(%[above])                        \n\t"
+
+      "lbu             %[left0],       (%[left])                          \n\t"
+
+      "preceu.ph.qbl   %[abovel],      %[reshw]                           \n\t"
+      "preceu.ph.qbr   %[abover],      %[reshw]                           \n\t"
+      "preceu.ph.qbl   %[abovel_1],    %[top_left]                        \n\t"
+      "preceu.ph.qbr   %[abover_1],    %[top_left]                        \n\t"
+
+      "lbu             %[top_left],    -1(%[above])                       \n\t"
+      "replv.ph        %[left0],       %[left0]                           \n\t"
+
+      "replv.ph        %[top_left],    %[top_left]                        \n\t"
+
+      /* rows 0..6; the argument is the index of the next left[] pixel */
+      TM8_ROW(1)
+      TM8_ROW(2)
+      TM8_ROW(3)
+      TM8_ROW(4)
+      TM8_ROW(5)
+      TM8_ROW(6)
+      TM8_ROW(7)
+      /* row 7 */
+      TM8_LAST_ROW
+
+      /* dst is advanced inside the asm, so it is a read-write operand */
+      : [abovel] "=&r"(abovel), [abover] "=&r"(abover),
+        [abovel_1] "=&r"(abovel_1), [abover_1] "=&r"(abover_1),
+        [left0] "=&r"(left0), [res2] "=&r"(res2), [res3] "=&r"(res3),
+        [res0] "=&r"(res0), [res1] "=&r"(res1), [reshw] "=&r"(reshw),
+        [top_left] "=&r"(top_left), [dst] "+&r"(dst)
+      : [above] "r"(above), [left] "r"(left), [stride] "r"(stride),
+        [cm] "r"(cm)
+      /* the stores go through dst, which is not listed as a memory operand */
+      : "memory");
+}
+
+#undef TM8_LAST_ROW
+#undef TM8_ROW
+#undef TM8_STORE4
+#undef TM8_CLAMP4
+#undef TM8_SUM4
+#undef PRED8_NEXT_ROW
+#undef PRED8_ADVANCE
+#undef PRED8_STORE_ROW
+#endif  // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/intrapred_msa.c b/third_party/aom/aom_dsp/mips/intrapred_msa.c
new file mode 100644
index 000000000..e8eaec7a9
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/intrapred_msa.c
@@ -0,0 +1,739 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/macros_msa.h" + +#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \ + { \ + out0 = __msa_subs_u_h(out0, in0); \ + out1 = __msa_subs_u_h(out1, in1); \ + } + +static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t src_data; + + src_data = LW(src); + + SW4(src_data, src_data, src_data, src_data, dst, dst_stride); +} + +static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint32_t src_data1, src_data2; + + src_data1 = LW(src); + src_data2 = LW(src + 4); + + for (row = 8; row--;) { + SW(src_data1, dst); + SW(src_data2, (dst + 4)); + dst += dst_stride; + } +} + +static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 src0; + + src0 = LD_UB(src); + + for (row = 16; row--;) { + ST_UB(src0, dst); + dst += dst_stride; + } +} + +static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 src1, src2; + + src1 = LD_UB(src); + src2 = LD_UB(src + 16); + + for (row = 32; row--;) { + ST_UB2(src1, src2, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t out0, out1, out2, out3; + + out0 = src[0] * 0x01010101; + out1 = src[1] * 0x01010101; + out2 = src[2] * 0x01010101; + out3 = src[3] * 0x01010101; + + SW4(out0, out1, out2, out3, dst, dst_stride); +} + +static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint64_t out0, 
out1, out2, out3, out4, out5, out6, out7; + + out0 = src[0] * 0x0101010101010101ull; + out1 = src[1] * 0x0101010101010101ull; + out2 = src[2] * 0x0101010101010101ull; + out3 = src[3] * 0x0101010101010101ull; + out4 = src[4] * 0x0101010101010101ull; + out5 = src[5] * 0x0101010101010101ull; + out6 = src[6] * 0x0101010101010101ull; + out7 = src[7] * 0x0101010101010101ull; + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); +} + +static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16u8 src0, src1, src2, src3; + + for (row = 4; row--;) { + inp0 = src[0]; + inp1 = src[1]; + inp2 = src[2]; + inp3 = src[3]; + src += 4; + + src0 = (v16u8)__msa_fill_b(inp0); + src1 = (v16u8)__msa_fill_b(inp1); + src2 = (v16u8)__msa_fill_b(inp2); + src3 = (v16u8)__msa_fill_b(inp3); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16u8 src0, src1, src2, src3; + + for (row = 8; row--;) { + inp0 = src[0]; + inp1 = src[1]; + inp2 = src[2]; + inp3 = src[3]; + src += 4; + + src0 = (v16u8)__msa_fill_b(inp0); + src1 = (v16u8)__msa_fill_b(inp1); + src2 = (v16u8)__msa_fill_b(inp2); + src3 = (v16u8)__msa_fill_b(inp3); + + ST_UB2(src0, src0, dst, 16); + dst += dst_stride; + ST_UB2(src1, src1, dst, 16); + dst += dst_stride; + ST_UB2(src2, src2, dst, 16); + dst += dst_stride; + ST_UB2(src3, src3, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_dc_4x4_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint32_t val0, val1; + v16i8 store, src = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LW(src_top); + val1 = LW(src_left); + INSERT_W2_SB(val0, val1, src); + sum_h = 
__msa_hadd_u_h((v16u8)src, (v16u8)src); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_w((v4i32)store, 0); + + SW4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t val0; + v16i8 store, data = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + + val0 = LW(src); + data = (v16i8)__msa_insert_w((v4i32)data, 0, val0); + sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_w((v4i32)store, 0); + + SW4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) { + uint32_t out; + const v16i8 store = __msa_ldi_b(128); + + out = __msa_copy_u_w((v4i32)store, 0); + + SW4(out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_8x8_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint64_t val0, val1; + v16i8 store; + v16u8 src = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LD(src_top); + val1 = LD(src_left); + INSERT_D2_UB(val0, val1, src); + sum_h = __msa_hadd_u_h(src, src); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_d((v2i64)store, 0); + + SD4(val0, val0, val0, val0, dst, dst_stride); + dst += (4 * dst_stride); + SD4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint64_t val0; + v16i8 store; + v16u8 
data = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LD(src); + data = (v16u8)__msa_insert_d((v2i64)data, 0, val0); + sum_h = __msa_hadd_u_h(data, data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_d((v2i64)store, 0); + + SD4(val0, val0, val0, val0, dst, dst_stride); + dst += (4 * dst_stride); + SD4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) { + uint64_t out; + const v16i8 store = __msa_ldi_b(128); + + out = __msa_copy_u_d((v2i64)store, 0); + + SD4(out, out, out, out, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_16x16_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + v16u8 top, left, out; + v8u16 sum_h, sum_top, sum_left; + v4u32 sum_w; + v2u64 sum_d; + + top = LD_UB(src_top); + left = LD_UB(src_left); + HADD_UB2_UH(top, left, sum_top, sum_left); + sum_h = sum_top + sum_left; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + v16u8 data, out; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + data = LD_UB(src); + sum_h = __msa_hadd_u_h(data, data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, 
sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) { + const v16u8 out = (v16u8)__msa_ldi_b(128); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_32x32_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 top0, top1, left0, left1, out; + v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1; + v4u32 sum_w; + v2u64 sum_d; + + LD_UB2(src_top, 16, top0, top1); + LD_UB2(src_left, 16, left0, left1); + HADD_UB2_UH(top0, top1, sum_top0, sum_top1); + HADD_UB2_UH(left0, left1, sum_left0, sum_left1); + sum_h = sum_top0 + sum_top1; + sum_h += sum_left0 + sum_left1; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 data0, data1, out; + v8u16 sum_h, sum_data0, sum_data1; + v4u32 sum_w; + v2u64 sum_d; + + LD_UB2(src, 16, data0, data1); + HADD_UB2_UH(data0, data1, sum_data0, sum_data1); + sum_h = sum_data0 + sum_data1; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); 
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) { + uint32_t row; + const v16u8 out = (v16u8)__msa_ldi_b(128); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint32_t val; + uint8_t top_left = src_top_ptr[-1]; + v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; + v16u8 src0, src1, src2, src3; + v8u16 src_top_left, vec0, vec1, vec2, vec3; + + src_top_left = (v8u16)__msa_fill_h(top_left); + val = LW(src_top_ptr); + src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val); + + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + + ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, + src_left3, src_top, src0, src1, src2, src3); + HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); + SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); + ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride); +} + +static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint64_t val; + uint8_t top_left = src_top_ptr[-1]; + uint32_t loop_cnt; + v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; + v8u16 src_top_left, vec0, vec1, vec2, vec3; + v16u8 src0, src1, src2, src3; + + val = 
LD(src_top_ptr); + src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val); + src_top_left = (v8u16)__msa_fill_h(top_left); + + for (loop_cnt = 2; loop_cnt--;) { + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + src_left += 4; + + ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, + src_left3, src_top, src0, src1, src2, src3); + HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); + SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint8_t top_left = src_top_ptr[-1]; + uint32_t loop_cnt; + v16i8 src_top, src_left0, src_left1, src_left2, src_left3; + v8u16 src_top_left, res_r, res_l; + + src_top = LD_SB(src_top_ptr); + src_top_left = (v8u16)__msa_fill_h(top_left); + + for (loop_cnt = 4; loop_cnt--;) { + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + src_left += 4; + + ILVRL_B2_UH(src_left0, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left1, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left2, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + 
IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left3, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + } +} + +static void intra_predict_tm_32x32_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint8_t top_left = src_top[-1]; + uint32_t loop_cnt; + v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3; + v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1; + + LD_SB2(src_top, 16, src_top0, src_top1); + src_top_left = (v8u16)__msa_fill_h(top_left); + + for (loop_cnt = 8; loop_cnt--;) { + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + src_left += 4; + + ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 
16); + dst += dst_stride; + + ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + } +} + +void aom_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_4x4_msa(above, dst, y_stride); +} + +void aom_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_8x8_msa(above, dst, y_stride); +} + +void aom_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_16x16_msa(above, dst, y_stride); +} + +void aom_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_32x32_msa(above, dst, y_stride); +} + +void aom_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_4x4_msa(left, dst, y_stride); +} + +void 
aom_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_8x8_msa(left, dst, y_stride); +} + +void aom_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_16x16_msa(left, dst, y_stride); +} + +void aom_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_32x32_msa(left, dst, y_stride); +} + +void aom_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_4x4_msa(above, left, dst, y_stride); +} + +void aom_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_8x8_msa(above, left, dst, y_stride); +} + +void aom_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_16x16_msa(above, left, dst, y_stride); +} + +void aom_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_32x32_msa(above, left, dst, y_stride); +} + +void aom_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_4x4_msa(above, dst, y_stride); +} + +void aom_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_8x8_msa(above, dst, y_stride); +} + +void aom_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_16x16_msa(above, dst, y_stride); +} + +void aom_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + 
intra_predict_dc_tl_32x32_msa(above, dst, y_stride); +} + +void aom_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_4x4_msa(left, dst, y_stride); +} + +void aom_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_8x8_msa(left, dst, y_stride); +} + +void aom_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_16x16_msa(left, dst, y_stride); +} + +void aom_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_32x32_msa(left, dst, y_stride); +} + +void aom_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_4x4_msa(dst, y_stride); +} + +void aom_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_8x8_msa(dst, y_stride); +} + +void aom_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_16x16_msa(dst, y_stride); +} + +void aom_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_32x32_msa(dst, y_stride); +} + +void aom_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_4x4_msa(above, left, dst, y_stride); +} + +void aom_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_8x8_msa(above, left, dst, y_stride); +} + +void 
aom_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_16x16_msa(above, left, dst, y_stride); +} + +void aom_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_32x32_msa(above, left, dst, y_stride); +} diff --git a/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h b/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h new file mode 100644 index 000000000..8a85e26f3 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_MIPS_INV_TXFM_DSPR2_H_ +#define AOM_DSP_MIPS_INV_TXFM_DSPR2_H_ + +#include + +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/inv_txfm.h" +#include "aom_dsp/mips/common_dspr2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) \ + ({ \ + \ + int32_t tmp, out; \ + int dct_cost_rounding = DCT_CONST_ROUNDING; \ + int in = input; \ + \ + __asm__ __volatile__(/* out = dct_const_round_shift(dc * cospi_16_64); */ \ + "mtlo %[dct_cost_rounding], $ac1 " \ + " \n\t" \ + "mthi $zero, $ac1 " \ + " \n\t" \ + "madd $ac1, %[in], " \ + "%[cospi_16_64] \n\t" \ + "extp %[tmp], $ac1, " \ + "31 \n\t" \ + \ + /* out = dct_const_round_shift(out * cospi_16_64); */ \ + "mtlo %[dct_cost_rounding], $ac2 " \ + " \n\t" \ + "mthi $zero, $ac2 " \ + " \n\t" \ + "madd $ac2, %[tmp], " \ + "%[cospi_16_64] \n\t" \ + "extp %[out], $ac2, " \ + "31 \n\t" \ + \ + : [tmp] "=&r"(tmp), [out] "=r"(out) \ + : [in] "r"(in), \ + [dct_cost_rounding] "r"(dct_cost_rounding), \ + [cospi_16_64] "r"(cospi_16_64)); \ + out; \ + }) + +void aom_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride); +void aom_idct4_rows_dspr2(const int16_t *input, int16_t *output); +void aom_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride); +void iadst4_dspr2(const int16_t *input, int16_t *output); +void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); +void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride); +void iadst8_dspr2(const int16_t *input, int16_t *output); +void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride); +void iadst16_dspr2(const int16_t *input, int16_t *output); + +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_MIPS_INV_TXFM_DSPR2_H_ diff --git 
a/third_party/aom/aom_dsp/mips/inv_txfm_msa.h b/third_party/aom/aom_dsp/mips/inv_txfm_msa.h new file mode 100644 index 000000000..122667aa8 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/inv_txfm_msa.h @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_INV_TXFM_MSA_H_ +#define AOM_DSP_MIPS_INV_TXFM_MSA_H_ + +#include "aom_dsp/mips/macros_msa.h" +#include "aom_dsp/mips/txfm_macros_msa.h" +#include "aom_dsp/txfm_common.h" + +#define AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ + v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ + cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ + v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \ + cospi_24_64, -cospi_24_64, 0, 0 }; \ + \ + SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ + ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ + cnst2_m, cnst3_m, in7, in0, in4, in3); \ + \ + SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ + 
cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ + cnst2_m, cnst3_m, in5, in2, in6, in1); \ + BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ + out7 = -s0_m; \ + out0 = s1_m; \ + \ + SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ + \ + ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + cnst1_m = cnst0_m; \ + \ + ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \ + cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \ + \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + out1 = -out1; \ + out3 = -out3; \ + out5 = -out5; \ + } + +#define AOM_SET_COSPI_PAIR(c0_h, c1_h) \ + ({ \ + v8i16 out0_m, r0_m, r1_m; \ + \ + r0_m = __msa_fill_h(c0_h); \ + r1_m = __msa_fill_h(c1_h); \ + out0_m = __msa_ilvev_h(r1_m, r0_m); \ + \ + out0_m; \ + }) + +#define AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) \ + { \ + uint8_t *dst_m = (uint8_t *)(dst); \ + v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \ + v16i8 tmp0_m, tmp1_m; \ + v16i8 zero_m = { 0 }; \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, 
dst3_m); \ + ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, zero_m, dst3_m, \ + res0_m, res1_m, res2_m, res3_m); \ + ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, res0_m, res1_m, \ + res2_m, res3_m); \ + CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \ + PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \ + } + +#define AOM_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 c0_m, c1_m, c2_m, c3_m; \ + v8i16 step0_m, step1_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + c0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ + c1_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ + step0_m = __msa_ilvr_h(in2, in0); \ + DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + \ + c2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + c3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + step1_m = __msa_ilvr_h(in3, in1); \ + DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + \ + PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \ + SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \ + BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, (v8i16)tmp2_m, (v8i16)tmp3_m, \ + out0, out1, out2, out3); \ + } + +#define AOM_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 res0_m, res1_m, c0_m, c1_m; \ + v8i16 k1_m, k2_m, k3_m, k4_m; \ + v8i16 zero_m = { 0 }; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v4i32 int0_m, int1_m, int2_m, int3_m; \ + v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9, \ + -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, -sinpi_4_9 }; \ + \ + SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \ + ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \ + ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \ + int0_m = tmp2_m + tmp1_m; \ + \ + SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \ + 
ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + int1_m = tmp0_m + tmp1_m; \ + \ + c0_m = __msa_splati_h(mask_m, 6); \ + ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \ + ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + int2_m = tmp0_m + tmp1_m; \ + \ + c0_m = __msa_splati_h(mask_m, 6); \ + c0_m = __msa_ilvev_h(c0_m, k1_m); \ + \ + res0_m = __msa_ilvr_h((in1), (in3)); \ + tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \ + int3_m = tmp2_m + tmp0_m; \ + \ + res0_m = __msa_ilvr_h((in2), (in3)); \ + c1_m = __msa_ilvev_h(k4_m, k3_m); \ + \ + tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \ + res1_m = __msa_ilvr_h((in0), (in2)); \ + c1_m = __msa_ilvev_h(k1_m, zero_m); \ + \ + tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \ + int3_m += tmp2_m; \ + int3_m += tmp3_m; \ + \ + SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \ + PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \ + } + +#define AV1_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) \ + ({ \ + v8i16 c0_m, c1_m; \ + \ + SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \ + c0_m = __msa_ilvev_h(c1_m, c0_m); \ + \ + c0_m; \ + }) + +/* multiply and add macro */ +#define AV1_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \ + out2, out3) \ + { \ + v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ + v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd; \ + \ + ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ + ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ + DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \ + cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ + SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out0, out1); \ + DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \ + cst3, 
tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ + SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3); \ + } + +/* idct 8x8 macro */ +#define AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \ + v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \ + cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \ + \ + k0_m = AV1_SET_CONST_PAIR(mask_m, 0, 5); \ + k1_m = AV1_SET_CONST_PAIR(mask_m, 1, 0); \ + k2_m = AV1_SET_CONST_PAIR(mask_m, 6, 3); \ + k3_m = AV1_SET_CONST_PAIR(mask_m, 3, 2); \ + AV1_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \ + SUB2(in1, in3, in7, in5, res0_m, res1_m); \ + k0_m = AV1_SET_CONST_PAIR(mask_m, 4, 7); \ + k1_m = __msa_splati_h(mask_m, 4); \ + \ + ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \ + DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \ + tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + tp4_m = in1 + in3; \ + PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \ + tp7_m = in7 + in5; \ + k2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + k3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + AV1_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, in0, in4, in2, in6); \ + BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \ + BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, out0, \ + out1, out2, out3, out4, out5, out6, out7); \ + } + +#define AV1_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \ + v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \ + 
v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \ + v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, cospi_10_64, \ + cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \ + v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, cospi_6_64, \ + -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \ + v8i16 mask3_m = { \ + -cospi_24_64, cospi_8_64, cospi_16_64, -cospi_16_64, 0, 0, 0, 0 \ + }; \ + \ + k0_m = AV1_SET_CONST_PAIR(mask1_m, 0, 1); \ + k1_m = AV1_SET_CONST_PAIR(mask1_m, 1, 2); \ + ILVRL_H2_SH(in1, in0, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ + r1_m, r2_m, r3_m); \ + k0_m = AV1_SET_CONST_PAIR(mask1_m, 6, 7); \ + k1_m = AV1_SET_CONST_PAIR(mask2_m, 0, 1); \ + ILVRL_H2_SH(in5, in4, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \ + r5_m, r6_m, r7_m); \ + ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \ + SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \ + k0_m = AV1_SET_CONST_PAIR(mask1_m, 3, 4); \ + k1_m = AV1_SET_CONST_PAIR(mask1_m, 4, 5); \ + ILVRL_H2_SH(in3, in2, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ + r1_m, r2_m, r3_m); \ + k0_m = AV1_SET_CONST_PAIR(mask2_m, 2, 3); \ + k1_m = AV1_SET_CONST_PAIR(mask2_m, 3, 4); \ + ILVRL_H2_SH(in7, in6, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \ + r5_m, r6_m, r7_m); \ + ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \ + SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, 
r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \ + ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \ + BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \ + k0_m = AV1_SET_CONST_PAIR(mask2_m, 5, 6); \ + k1_m = AV1_SET_CONST_PAIR(mask2_m, 6, 7); \ + ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ + r1_m, r2_m, r3_m); \ + k1_m = AV1_SET_CONST_PAIR(mask3_m, 0, 1); \ + DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, r4_m, r5_m, \ + r6_m, r7_m); \ + ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \ + SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \ + k0_m = AV1_SET_CONST_PAIR(mask3_m, 2, 2); \ + k1_m = AV1_SET_CONST_PAIR(mask3_m, 2, 3); \ + ILVRL_H2_SH(in4, in3, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, m0_m, \ + m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \ + ILVRL_H2_SW(in5, in2, m2_m, m3_m); \ + DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, m0_m, m1_m, \ + m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \ + \ + out1 = -in1; \ + out3 = -in3; \ + out5 = -in5; \ + out7 = -in7; \ + } + +#define AOM_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, \ + r12, r13, r14, r15, out0, out1, out2, out3, out4, \ + out5, out6, out7, out8, out9, out10, out11, out12, \ + out13, out14, out15) \ + { \ + v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \ + v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, 
g14_m, g15_m; \ + v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \ + v8i16 h8_m, h9_m, h10_m, h11_m; \ + v8i16 k0_m, k1_m, k2_m, k3_m; \ + \ + /* stage 1 */ \ + k0_m = AOM_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \ + k1_m = AOM_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \ + k2_m = AOM_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \ + k3_m = AOM_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \ + MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, g0_m, g1_m, g2_m, g3_m); \ + k0_m = AOM_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \ + k1_m = AOM_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \ + k2_m = AOM_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \ + k3_m = AOM_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \ + MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, g4_m, g5_m, g6_m, g7_m); \ + k0_m = AOM_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \ + k1_m = AOM_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \ + k2_m = AOM_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \ + k3_m = AOM_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \ + MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, g8_m, g9_m, g10_m, \ + g11_m); \ + k0_m = AOM_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \ + k1_m = AOM_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \ + k2_m = AOM_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \ + k3_m = AOM_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \ + MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, g12_m, g13_m, g14_m, \ + g15_m); \ + \ + /* stage 2 */ \ + k0_m = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \ + k1_m = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \ + k2_m = AOM_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \ + MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, h0_m, h1_m, h2_m, \ + h3_m); \ + k0_m = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \ + k1_m = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \ + k2_m = AOM_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \ + MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, h4_m, h5_m, \ + h6_m, h7_m); \ + BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, 
out11, out10); \ + BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, h8_m, h9_m, \ + h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \ + \ + /* stage 3 */ \ + BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \ + k0_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + k1_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + k2_m = AOM_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \ + MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, out4, out6, out5, \ + out7); \ + MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, out12, out14, \ + out13, out15); \ + \ + /* stage 4 */ \ + k0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ + k1_m = AOM_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \ + k2_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ + k3_m = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \ + MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \ + MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \ + MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \ + MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \ + } + +void aom_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride); +void aom_idct16_1d_rows_msa(const int16_t *input, int16_t *output); +void aom_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride); +void aom_iadst16_1d_rows_msa(const int16_t *input, int16_t *output); +#endif // AOM_DSP_MIPS_INV_TXFM_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/itrans16_dspr2.c b/third_party/aom/aom_dsp/mips/itrans16_dspr2.c new file mode 100644 index 000000000..c63b1e857 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/itrans16_dspr2.c @@ -0,0 +1,1190 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/inv_txfm_dspr2.h" +#include "aom_dsp/txfm_common.h" + +#if HAVE_DSPR2 +void idct16_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { + int i; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int step1_10, step1_11, step1_12, step1_13; + int step2_0, step2_1, step2_2, step2_3; + int step2_8, step2_9, step2_10, step2_11; + int step2_12, step2_13, step2_14, step2_15; + int load1, load2, load3, load4, load5, load6, load7, load8; + int result1, result2, result3, result4; + const int const_2_power_13 = 8192; + + for (i = no_rows; i--;) { + /* prefetch row */ + prefetch_load((const uint8_t *)(input + 16)); + + __asm__ __volatile__( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 16(%[input]) \n\t" + "lh %[load3], 8(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[step2_0], $ac1, 31 \n\t" + "extp %[step2_1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[step2_2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[step2_3], $ac1, 
31 \n\t" + + "add %[step1_0], %[step2_0], %[step2_3] \n\t" + "add %[step1_1], %[step2_1], %[step2_2] \n\t" + "sub %[step1_2], %[step2_1], %[step2_2] \n\t" + "sub %[step1_3], %[step2_0], %[step2_3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [step2_0] "=&r"(step2_0), + [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2), + [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "lh %[load5], 2(%[input]) \n\t" + "lh %[load6], 30(%[input]) \n\t" + "lh %[load7], 18(%[input]) \n\t" + "lh %[load8], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_30_64] \n\t" + "msub $ac1, %[load6], %[cospi_2_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_14_64] \n\t" + "msub $ac3, %[load8], %[cospi_18_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_18_64] \n\t" + "madd $ac1, %[load8], %[cospi_14_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_2_64] \n\t" + "madd $ac2, %[load6], %[cospi_30_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "sub %[load5], %[result1], %[result2] \n\t" + "sub %[load6], %[result4], %[result3] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load6], %[cospi_24_64] \n\t" + "msub $ac1, %[load5], %[cospi_8_64] \n\t" + "madd 
$ac3, %[load5], %[cospi_24_64] \n\t" + "madd $ac3, %[load6], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[result1], %[result2] \n\t" + "add %[step2_15], %[result4], %[result3] \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_8] "=r"(step2_8), + [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9), + [step2_14] "=r"(step2_14) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 22(%[input]) \n\t" + "lh %[load3], 26(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load3], %[cospi_6_64] \n\t" + "msub $ac3, %[load4], %[cospi_26_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load1], %[cospi_10_64] \n\t" + "madd $ac1, %[load2], %[cospi_22_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load3], %[cospi_26_64] \n\t" + "madd $ac2, %[load4], %[cospi_6_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[result2], %[result1] \n\t" + "sub %[load2], %[result4], %[result3] \n\t" + + "msub 
$ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[result1], %[result2] \n\t" + "add %[step2_12], %[result4], %[result3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "lh %[load5], 4(%[input]) \n\t" + "lh %[load6], 28(%[input]) \n\t" + "lh %[load7], 20(%[input]) \n\t" + "lh %[load8], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_28_64] \n\t" + "msub $ac1, %[load6], %[cospi_4_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_12_64] \n\t" + "msub $ac3, %[load8], %[cospi_20_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_20_64] \n\t" + "madd $ac1, %[load8], %[cospi_12_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_4_64] \n\t" + "madd $ac2, %[load6], %[cospi_28_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub 
%[load5], %[result4], %[result3] \n\t" + "sub %[load5], %[load5], %[result1] \n\t" + "add %[load5], %[load5], %[result2] \n\t" + + "sub %[load6], %[result1], %[result2] \n\t" + "sub %[load6], %[load6], %[result3] \n\t" + "add %[load6], %[load6], %[result4] \n\t" + + "madd $ac1, %[load5], %[cospi_16_64] \n\t" + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[result1], %[result2] \n\t" + "add %[step1_7], %[result4], %[result3] \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "sub %[load5], %[step2_14], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + + "madd $ac0, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_14], %[step2_13] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "add %[load6], %[load6], %[step2_9] \n\t" + + "madd $ac1, %[load6], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[step2_15], %[step2_12] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "add %[load5], %[load5], %[step2_11] \n\t" + + "madd $ac2, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_15], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_11] 
\n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_10], $ac0, 31 \n\t" + "extp %[step1_13], $ac1, 31 \n\t" + "extp %[step1_11], $ac2, 31 \n\t" + "extp %[step1_12], $ac3, 31 \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10), + [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12), + [step1_13] "=r"(step1_13) + : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14), + [step2_13] "r"(step2_13), [step2_9] "r"(step2_9), + [step2_10] "r"(step2_10), [step2_15] "r"(step2_15), + [step2_12] "r"(step2_12), [step2_8] "r"(step2_8), + [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "add %[load5], %[step1_0], %[step1_7] \n\t" + "add %[load5], %[load5], %[step2_12] \n\t" + "add %[load5], %[load5], %[step2_15] \n\t" + "add %[load6], %[step1_1], %[step1_6] \n\t" + "add %[load6], %[load6], %[step2_13] \n\t" + "add %[load6], %[load6], %[step2_14] \n\t" + "sh %[load5], 0(%[output]) \n\t" + "sh %[load6], 32(%[output]) \n\t" + "sub %[load5], %[step1_1], %[step1_6] \n\t" + "add %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + "sub %[load6], %[step1_0], %[step1_7] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + "add %[load6], %[load6], %[step2_11] \n\t" + "sh %[load5], 192(%[output]) \n\t" + "sh %[load6], 224(%[output]) \n\t" + "sub %[load5], %[step1_0], %[step1_7] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "sub %[load5], %[load5], %[step2_11] \n\t" + "sub %[load6], %[step1_1], %[step1_6] \n\t" + "sub %[load6], %[load6], %[step2_9] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "sh %[load5], 256(%[output]) \n\t" + "sh %[load6], 288(%[output]) \n\t" + "add %[load5], %[step1_1], %[step1_6] \n\t" + "sub %[load5], %[load5], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_14] \n\t" + "add %[load6], %[step1_0], %[step1_7] \n\t" + "sub %[load6], %[load6], %[step2_12] \n\t" + "sub %[load6], 
%[load6], %[step2_15] \n\t" + "sh %[load5], 448(%[output]) \n\t" + "sh %[load6], 480(%[output]) \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6) + : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1), + [step1_6] "r"(step1_6), [step1_7] "r"(step1_7), + [step2_8] "r"(step2_8), [step2_9] "r"(step2_9), + [step2_10] "r"(step2_10), [step2_11] "r"(step2_11), + [step2_12] "r"(step2_12), [step2_13] "r"(step2_13), + [step2_14] "r"(step2_14), [step2_15] "r"(step2_15)); + + __asm__ __volatile__( + "add %[load5], %[step1_2], %[step1_5] \n\t" + "add %[load5], %[load5], %[step1_13] \n\t" + "add %[load6], %[step1_3], %[step1_4] \n\t" + "add %[load6], %[load6], %[step1_12] \n\t" + "sh %[load5], 64(%[output]) \n\t" + "sh %[load6], 96(%[output]) \n\t" + "sub %[load5], %[step1_3], %[step1_4] \n\t" + "add %[load5], %[load5], %[step1_11] \n\t" + "sub %[load6], %[step1_2], %[step1_5] \n\t" + "add %[load6], %[load6], %[step1_10] \n\t" + "sh %[load5], 128(%[output]) \n\t" + "sh %[load6], 160(%[output]) \n\t" + "sub %[load5], %[step1_2], %[step1_5] \n\t" + "sub %[load5], %[load5], %[step1_10] \n\t" + "sub %[load6], %[step1_3], %[step1_4] \n\t" + "sub %[load6], %[load6], %[step1_11] \n\t" + "sh %[load5], 320(%[output]) \n\t" + "sh %[load6], 352(%[output]) \n\t" + "add %[load5], %[step1_3], %[step1_4] \n\t" + "sub %[load5], %[load5], %[step1_12] \n\t" + "add %[load6], %[step1_2], %[step1_5] \n\t" + "sub %[load6], %[load6], %[step1_13] \n\t" + "sh %[load5], 384(%[output]) \n\t" + "sh %[load6], 416(%[output]) \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6) + : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), + [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), + [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), + [step1_12] "r"(step1_12), [step1_13] "r"(step1_13)); + + input += 16; + output += 1; + } +} + +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { + int i; + int step1_0, step1_1, step1_2, step1_3, 
step1_4, step1_5, step1_6, step1_7; + int step1_8, step1_9, step1_10, step1_11; + int step1_12, step1_13, step1_14, step1_15; + int step2_0, step2_1, step2_2, step2_3; + int step2_8, step2_9, step2_10, step2_11; + int step2_12, step2_13, step2_14, step2_15; + int load1, load2, load3, load4, load5, load6, load7, load8; + int result1, result2, result3, result4; + const int const_2_power_13 = 8192; + uint8_t *dest_pix; + uint8_t *cm = aom_ff_cropTbl; + + /* prefetch aom_ff_cropTbl */ + prefetch_load(aom_ff_cropTbl); + prefetch_load(aom_ff_cropTbl + 32); + prefetch_load(aom_ff_cropTbl + 64); + prefetch_load(aom_ff_cropTbl + 96); + prefetch_load(aom_ff_cropTbl + 128); + prefetch_load(aom_ff_cropTbl + 160); + prefetch_load(aom_ff_cropTbl + 192); + prefetch_load(aom_ff_cropTbl + 224); + + for (i = 0; i < 16; ++i) { + dest_pix = (dest + i); + __asm__ __volatile__( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 16(%[input]) \n\t" + "lh %[load3], 8(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[step2_0], $ac1, 31 \n\t" + "extp %[step2_1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[step2_2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[step2_3], $ac1, 31 \n\t" + + "add %[step1_0], %[step2_0], %[step2_3] \n\t" + "add %[step1_1], %[step2_1], %[step2_2] \n\t" + "sub %[step1_2], %[step2_1], %[step2_2] \n\t" + "sub %[step1_3], %[step2_0], %[step2_3] \n\t" + + : [load1] "=&r"(load1), 
[load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [step2_0] "=&r"(step2_0), + [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2), + [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "lh %[load5], 2(%[input]) \n\t" + "lh %[load6], 30(%[input]) \n\t" + "lh %[load7], 18(%[input]) \n\t" + "lh %[load8], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_30_64] \n\t" + "msub $ac1, %[load6], %[cospi_2_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_14_64] \n\t" + "msub $ac3, %[load8], %[cospi_18_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_18_64] \n\t" + "madd $ac1, %[load8], %[cospi_14_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_2_64] \n\t" + "madd $ac2, %[load6], %[cospi_30_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "sub %[load5], %[result1], %[result2] \n\t" + "sub %[load6], %[result4], %[result3] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load6], %[cospi_24_64] \n\t" + "msub $ac1, %[load5], %[cospi_8_64] \n\t" + "madd $ac3, %[load5], %[cospi_24_64] \n\t" + "madd $ac3, %[load6], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[result1], %[result2] \n\t" + "add %[step2_15], %[result4], 
%[result3] \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_8] "=r"(step2_8), + [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9), + [step2_14] "=r"(step2_14) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 22(%[input]) \n\t" + "lh %[load3], 26(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load3], %[cospi_6_64] \n\t" + "msub $ac3, %[load4], %[cospi_26_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load1], %[cospi_10_64] \n\t" + "madd $ac1, %[load2], %[cospi_22_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load3], %[cospi_26_64] \n\t" + "madd $ac2, %[load4], %[cospi_6_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[result2], %[result1] \n\t" + "sub %[load2], %[result4], %[result3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], 
$ac3, 31 \n\t" + "add %[step2_11], %[result1], %[result2] \n\t" + "add %[step2_12], %[result4], %[result3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "lh %[load5], 4(%[input]) \n\t" + "lh %[load6], 28(%[input]) \n\t" + "lh %[load7], 20(%[input]) \n\t" + "lh %[load8], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_28_64] \n\t" + "msub $ac1, %[load6], %[cospi_4_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_12_64] \n\t" + "msub $ac3, %[load8], %[cospi_20_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_20_64] \n\t" + "madd $ac1, %[load8], %[cospi_12_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_4_64] \n\t" + "madd $ac2, %[load6], %[cospi_28_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[result4], %[result3] \n\t" + "sub %[load5], %[load5], %[result1] \n\t" + "add %[load5], %[load5], %[result2] \n\t" + + "sub %[load6], %[result1], %[result2] \n\t" + "sub %[load6], %[load6], %[result3] \n\t" + "add 
%[load6], %[load6], %[result4] \n\t" + + "madd $ac1, %[load5], %[cospi_16_64] \n\t" + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + + "add %[step1_4], %[result1], %[result2] \n\t" + "add %[step1_7], %[result4], %[result3] \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "sub %[load5], %[step2_14], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + + "madd $ac0, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_14], %[step2_13] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "add %[load6], %[load6], %[step2_9] \n\t" + + "madd $ac1, %[load6], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[step2_15], %[step2_12] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "add %[load5], %[load5], %[step2_11] \n\t" + + "madd $ac2, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_15], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_11] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_10], $ac0, 31 \n\t" + "extp %[step1_13], $ac1, 31 \n\t" + "extp %[step1_11], $ac2, 31 \n\t" + "extp 
%[step1_12], $ac3, 31 \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10), + [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12), + [step1_13] "=r"(step1_13) + : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14), + [step2_13] "r"(step2_13), [step2_9] "r"(step2_9), + [step2_10] "r"(step2_10), [step2_15] "r"(step2_15), + [step2_12] "r"(step2_12), [step2_8] "r"(step2_8), + [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64)); + + step1_8 = step2_8 + step2_11; + step1_9 = step2_9 + step2_10; + step1_14 = step2_13 + step2_14; + step1_15 = step2_12 + step2_15; + + __asm__ __volatile__( + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_0], %[step1_7] \n\t" + "add %[load5], %[load5], %[step1_15] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_1], %[step1_6] \n\t" + "add %[load6], %[load6], %[step1_14] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_2], %[step1_5] \n\t" + "add %[load5], %[load5], %[step1_13] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_3], %[step1_4] \n\t" + "add %[load6], %[load6], %[step1_12] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], 
%[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_3], %[step1_4] \n\t" + "add %[load5], %[load5], %[step1_11] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_2], %[step1_5] \n\t" + "add %[load6], %[load6], %[step1_10] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "sub %[load5], %[step1_1], %[step1_6] \n\t" + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[load5], %[step1_9] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_0], %[step1_7] \n\t" + "add %[load6], %[load6], %[step1_8] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_0], %[step1_7] \n\t" + "sub %[load5], %[load5], %[step1_8] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_1], %[step1_6] \n\t" + "sub %[load6], 
%[load6], %[step1_9] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_2], %[step1_5] \n\t" + "sub %[load5], %[load5], %[step1_10] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_3], %[step1_4] \n\t" + "sub %[load6], %[load6], %[step1_11] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_3], %[step1_4] \n\t" + "sub %[load5], %[load5], %[step1_12] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_2], %[step1_5] \n\t" + "sub %[load6], %[load6], %[step1_13] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_1], %[step1_6] \n\t" + "sub 
%[load5], %[load5], %[step1_14] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_0], %[step1_7] \n\t" + "sub %[load6], %[load6], %[step1_15] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix) + : + [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), + [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), + [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), + [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9), + [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), + [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), + [step1_14] "r"(step1_14), [step1_15] "r"(step1_15)); + + input += 16; + } +} + +void aom_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); + + // First transform rows + idct16_rows_dspr2(input, out, 16); + + // Then transform columns and add to dest + idct16_cols_add_blk_dspr2(out, dest, dest_stride); +} + +void aom_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + int16_t *outptr = out; + uint32_t i; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); + + // First transform rows. 
Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. + idct16_rows_dspr2(input, outptr, 4); + + outptr += 4; + for (i = 0; i < 6; ++i) { + __asm__ __volatile__( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 64(%[outptr]) \n\t" + "sw $zero, 96(%[outptr]) \n\t" + "sw $zero, 128(%[outptr]) \n\t" + "sw $zero, 160(%[outptr]) \n\t" + "sw $zero, 192(%[outptr]) \n\t" + "sw $zero, 224(%[outptr]) \n\t" + "sw $zero, 256(%[outptr]) \n\t" + "sw $zero, 288(%[outptr]) \n\t" + "sw $zero, 320(%[outptr]) \n\t" + "sw $zero, 352(%[outptr]) \n\t" + "sw $zero, 384(%[outptr]) \n\t" + "sw $zero, 416(%[outptr]) \n\t" + "sw $zero, 448(%[outptr]) \n\t" + "sw $zero, 480(%[outptr]) \n\t" + + : + : [outptr] "r"(outptr)); + + outptr += 2; + } + + // Then transform columns + idct16_cols_add_blk_dspr2(out, dest, dest_stride); +} + +void aom_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + uint32_t pos = 45; + int32_t out; + int32_t r; + int32_t a1, absa1; + int32_t vector_a1; + int32_t t1, t2, t3, t4; + int32_t vector_1, vector_2, vector_3, vector_4; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + + : + : [pos] "r"(pos)); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__( + "addi %[out], %[out], 32 \n\t" + "sra %[a1], %[out], 6 \n\t" + + : [out] "+r"(out), [a1] "=r"(a1) + :); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 16; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" 
/*
 * 16x16 inverse DCT for a block in which only input[0] is used: one offset is
 * derived from that coefficient and applied, with unsigned byte saturation,
 * to 16 rows of 16 destination pixels.
 *
 * input       - coefficient block; only input[0] is read.
 * dest        - destination pixels, updated in place. Rows are accessed with
 *               word loads/stores, so dest and dest_stride are expected to
 *               keep every row four-byte aligned.
 * dest_stride - distance in bytes between consecutive destination rows.
 */
void aom_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
                               int dest_stride) {
  uint32_t pos = 45;
  int32_t out;
  int32_t r;
  int32_t a1, absa1;
  int32_t vector_a1;
  int32_t t1, t2, t3, t4;
  int32_t vector_1, vector_2, vector_3, vector_4;

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"

                       :
                       : [pos] "r"(pos));

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  /* a1 = (out + 32) >> 6 with an arithmetic shift. */
  __asm__ __volatile__(
      "addi %[out], %[out], 32 \n\t"
      "sra %[a1], %[out], 6 \n\t"

      : [out] "+r"(out), [a1] "=r"(a1)
      :);

  /* replv.qb copies only the least-significant byte of its source, so a
   * magnitude above 255 would wrap around instead of saturating. Clamping to
   * 255 is exact: a saturating add of 255 drives any byte to 255 and a
   * saturating subtract of 255 drives any byte to 0, which is what any larger
   * offset would produce as well. */
  absa1 = (a1 < 0) ? -a1 : a1;
  if (absa1 > 255) absa1 = 255;

  /* Broadcast the magnitude into all four byte lanes. */
  __asm__ __volatile__("replv.qb %[vector_a1], %[absa1] \n\t"

                       : [vector_a1] "=r"(vector_a1)
                       : [absa1] "r"(absa1));

  if (a1 < 0) {
    /* use quad-byte
     * input and output memory are four byte aligned */
    for (r = 16; r--;) {
      /* One row: 16 pixels as four words, saturating subtract per byte.
       * "addu" is used for the pointer step so address arithmetic cannot
       * raise an overflow trap; "memory" tells the compiler that dest is read
       * and written behind its back. */
      __asm__ __volatile__(
          "lw %[t1], 0(%[dest]) \n\t"
          "lw %[t2], 4(%[dest]) \n\t"
          "lw %[t3], 8(%[dest]) \n\t"
          "lw %[t4], 12(%[dest]) \n\t"
          "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
          "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
          "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
          "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
          "sw %[vector_1], 0(%[dest]) \n\t"
          "sw %[vector_2], 4(%[dest]) \n\t"
          "sw %[vector_3], 8(%[dest]) \n\t"
          "sw %[vector_4], 12(%[dest]) \n\t"
          "addu %[dest], %[dest], %[dest_stride] \n\t"

          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)
          : "memory");
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    for (r = 16; r--;) {
      /* One row: 16 pixels as four words, saturating add per byte. */
      __asm__ __volatile__(
          "lw %[t1], 0(%[dest]) \n\t"
          "lw %[t2], 4(%[dest]) \n\t"
          "lw %[t3], 8(%[dest]) \n\t"
          "lw %[t4], 12(%[dest]) \n\t"
          "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
          "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
          "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
          "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
          "sw %[vector_1], 0(%[dest]) \n\t"
          "sw %[vector_2], 4(%[dest]) \n\t"
          "sw %[vector_3], 8(%[dest]) \n\t"
          "sw %[vector_4], 12(%[dest]) \n\t"
          "addu %[dest], %[dest], %[dest_stride] \n\t"

          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)
          : "memory");
    }
  }
}
x11 = input[10]; + int x12 = input[3]; + int x13 = input[12]; + int x14 = input[1]; + int x15 = input[14]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = output[7] = output[8] = output[9] = output[10] = + output[11] = output[12] = output[13] = output[14] = output[15] = 0; + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = dct_const_round_shift(s0 + s8); + x1 = dct_const_round_shift(s1 + s9); + x2 = dct_const_round_shift(s2 + s10); + x3 = dct_const_round_shift(s3 + s11); + x4 = dct_const_round_shift(s4 + s12); + x5 = dct_const_round_shift(s5 + s13); + x6 = dct_const_round_shift(s6 + s14); + x7 = dct_const_round_shift(s7 + s15); + x8 = dct_const_round_shift(s0 - s8); + x9 = dct_const_round_shift(s1 - s9); + x10 = dct_const_round_shift(s2 - s10); + x11 = dct_const_round_shift(s3 - s11); + x12 = dct_const_round_shift(s4 - s12); + x13 = dct_const_round_shift(s5 - s13); + x14 = dct_const_round_shift(s6 - s14); + x15 = dct_const_round_shift(s7 - s15); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 
* cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = s0 + s4; + x1 = s1 + s5; + x2 = s2 + s6; + x3 = s3 + s7; + x4 = s0 - s4; + x5 = s1 - s5; + x6 = s2 - s6; + x7 = s3 - s7; + x8 = dct_const_round_shift(s8 + s12); + x9 = dct_const_round_shift(s9 + s13); + x10 = dct_const_round_shift(s10 + s14); + x11 = dct_const_round_shift(s11 + s15); + x12 = dct_const_round_shift(s8 - s12); + x13 = dct_const_round_shift(s9 - s13); + x14 = dct_const_round_shift(s10 - s14); + x15 = dct_const_round_shift(s11 - s15); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = dct_const_round_shift(s4 + s6); + x5 = dct_const_round_shift(s5 + s7); + x6 = dct_const_round_shift(s4 - s6); + x7 = dct_const_round_shift(s5 - s7); + x8 = s8 + s10; + x9 = s9 + s11; + x10 = s8 - s10; + x11 = s9 - s11; + x12 = dct_const_round_shift(s12 + s14); + x13 = dct_const_round_shift(s13 + s15); + x14 = dct_const_round_shift(s12 - s14); + x15 = dct_const_round_shift(s13 - s15); + + // stage 4 + s2 = (-cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (-cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = dct_const_round_shift(s2); + x3 = 
dct_const_round_shift(s3); + x6 = dct_const_round_shift(s6); + x7 = dct_const_round_shift(s7); + x10 = dct_const_round_shift(s10); + x11 = dct_const_round_shift(s11); + x14 = dct_const_round_shift(s14); + x15 = dct_const_round_shift(s15); + + output[0] = x0; + output[1] = -x8; + output[2] = x12; + output[3] = -x4; + output[4] = x6; + output[5] = x14; + output[6] = x10; + output[7] = x2; + output[8] = x3; + output[9] = x11; + output[10] = x15; + output[11] = x7; + output[12] = x5; + output[13] = -x13; + output[14] = x9; + output[15] = -x1; +} + +#endif // HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c b/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c new file mode 100644 index 000000000..d469d1ad0 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c @@ -0,0 +1,1042 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "./aom_config.h" +#include "aom_dsp/mips/inv_txfm_dspr2.h" +#include "aom_dsp/txfm_common.h" + +#if HAVE_DSPR2 +void aom_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; + int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26; + int16_t step1_27, step1_28, step1_29, step1_30, step1_31; + int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int16_t step2_28, step2_29, step2_30, step2_31; + int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27; + int16_t step3_28, step3_29, step3_30, step3_31; + int temp0, temp1, temp2, temp3; + int load1, load2, load3, load4; + int result1, result2; + int i, temp21; + uint8_t *dest_pix, *dest_pix1; + const int const_2_power_13 = 8192; + uint8_t *cm = aom_ff_cropTbl; + + /* prefetch aom_ff_cropTbl */ + prefetch_load(aom_ff_cropTbl); + prefetch_load(aom_ff_cropTbl + 32); + prefetch_load(aom_ff_cropTbl + 64); + prefetch_load(aom_ff_cropTbl + 96); + prefetch_load(aom_ff_cropTbl + 128); + prefetch_load(aom_ff_cropTbl + 160); + prefetch_load(aom_ff_cropTbl + 192); + prefetch_load(aom_ff_cropTbl + 224); + + for (i = 0; i < 32; ++i) { + dest_pix = dest + i; + dest_pix1 = dest + i + 31 * dest_stride; + + __asm__ __volatile__( + "lh %[load1], 2(%[input]) \n\t" + "lh %[load2], 62(%[input]) \n\t" + "lh %[load3], 34(%[input]) \n\t" + "lh 
%[load4], 30(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_31_64] \n\t" + "msub $ac1, %[load2], %[cospi_1_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_1_64] \n\t" + "madd $ac3, %[load2], %[cospi_31_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_15_64] \n\t" + "msub $ac2, %[load4], %[cospi_17_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_17_64] \n\t" + "madd $ac1, %[load4], %[cospi_15_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_17], $ac1, 31 \n\t" + "extp %[step1_30], $ac3, 31 \n\t" + "add %[step1_16], %[temp0], %[temp1] \n\t" + "add %[step1_31], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), + [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), + [step1_31] "=r"(step1_31) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), + [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( + "lh %[load1], 18(%[input]) \n\t" + "lh %[load2], 
46(%[input]) \n\t" + "lh %[load3], 50(%[input]) \n\t" + "lh %[load4], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_23_64] \n\t" + "msub $ac1, %[load2], %[cospi_9_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_9_64] \n\t" + "madd $ac3, %[load2], %[cospi_23_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_7_64] \n\t" + "msub $ac2, %[load4], %[cospi_25_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_25_64] \n\t" + "madd $ac1, %[load4], %[cospi_7_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "msub $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_18], $ac1, 31 \n\t" + "extp %[step1_29], $ac3, 31 \n\t" + "add %[step1_19], %[temp0], %[temp1] \n\t" + "add %[step1_28], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), + [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), + [step1_29] "=r"(step1_29) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), + [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( 
+ "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 54(%[input]) \n\t" + "lh %[load3], 42(%[input]) \n\t" + "lh %[load4], 22(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_27_64] \n\t" + "msub $ac1, %[load2], %[cospi_5_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_5_64] \n\t" + "madd $ac3, %[load2], %[cospi_27_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_11_64] \n\t" + "msub $ac2, %[load4], %[cospi_21_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_21_64] \n\t" + "madd $ac1, %[load4], %[cospi_11_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "madd $ac1, %[load2], %[cospi_12_64] \n\t" + "msub $ac1, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load1], %[cospi_12_64] \n\t" + "madd $ac3, %[load2], %[cospi_20_64] \n\t" + + "extp %[step1_21], $ac1, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + "add %[step1_20], %[temp0], %[temp1] \n\t" + "add %[step1_27], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), + [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), + [step1_27] "=r"(step1_27) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), + [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), + [cospi_12_64] "r"(cospi_12_64), 
[cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( + "lh %[load1], 26(%[input]) \n\t" + "lh %[load2], 38(%[input]) \n\t" + "lh %[load3], 58(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_19_64] \n\t" + "msub $ac1, %[load2], %[cospi_13_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_13_64] \n\t" + "madd $ac3, %[load2], %[cospi_19_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_3_64] \n\t" + "msub $ac2, %[load4], %[cospi_29_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_29_64] \n\t" + "madd $ac1, %[load4], %[cospi_3_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_12_64] \n\t" + "msub $ac1, %[load2], %[cospi_20_64] \n\t" + "msub $ac3, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load2], %[cospi_12_64] \n\t" + "extp %[step1_22], $ac1, 31 \n\t" + "extp %[step1_25], $ac3, 31 \n\t" + "add %[step1_23], %[temp0], %[temp1] \n\t" + "add %[step1_24], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), + [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), + [step1_25] "=r"(step1_25) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), + [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] 
"r"(cospi_29_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( + "lh %[load1], 4(%[input]) \n\t" + "lh %[load2], 60(%[input]) \n\t" + "lh %[load3], 36(%[input]) \n\t" + "lh %[load4], 28(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_30_64] \n\t" + "msub $ac1, %[load2], %[cospi_2_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_2_64] \n\t" + "madd $ac3, %[load2], %[cospi_30_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_14_64] \n\t" + "msub $ac2, %[load4], %[cospi_18_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_18_64] \n\t" + "madd $ac1, %[load4], %[cospi_14_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + "msub $ac1, %[load1], %[cospi_8_64] \n\t" + "madd $ac1, %[load2], %[cospi_24_64] \n\t" + "madd $ac3, %[load1], %[cospi_24_64] \n\t" + "madd $ac3, %[load2], %[cospi_8_64] \n\t" + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[temp0], %[temp1] \n\t" + "add %[step2_15], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), + [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), + [step2_15] "=r"(step2_15) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + 
[cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( + "lh %[load1], 20(%[input]) \n\t" + "lh %[load2], 44(%[input]) \n\t" + "lh %[load3], 52(%[input]) \n\t" + "lh %[load4], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_10_64] \n\t" + "madd $ac3, %[load2], %[cospi_22_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_6_64] \n\t" + "msub $ac2, %[load4], %[cospi_26_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_26_64] \n\t" + "madd $ac1, %[load4], %[cospi_6_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[temp0], %[temp1] \n\t" + "add %[step2_12], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] 
"r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "sub %[temp0], %[step2_14], %[step2_13] \n\t" + "sub %[temp0], %[temp0], %[step2_9] \n\t" + "add %[temp0], %[temp0], %[step2_10] \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp1], %[step2_14], %[step2_13] \n\t" + "add %[temp1], %[temp1], %[step2_9] \n\t" + "sub %[temp1], %[temp1], %[step2_10] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "sub %[temp0], %[step2_15], %[step2_12] \n\t" + "sub %[temp0], %[temp0], %[step2_8] \n\t" + "add %[temp0], %[temp0], %[step2_11] \n\t" + "madd $ac2, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sub %[temp1], %[step2_15], %[step2_12] \n\t" + "add %[temp1], %[temp1], %[step2_8] \n\t" + "sub %[temp1], %[temp1], %[step2_11] \n\t" + "madd $ac3, %[temp1], %[cospi_16_64] \n\t" + + "add %[step3_8], %[step2_8], %[step2_11] \n\t" + "add %[step3_9], %[step2_9], %[step2_10] \n\t" + "add %[step3_14], %[step2_13], %[step2_14] \n\t" + "add %[step3_15], %[step2_12], %[step2_15] \n\t" + "extp %[step3_10], $ac0, 31 \n\t" + "extp %[step3_13], $ac1, 31 \n\t" + "extp %[step3_11], $ac2, 31 \n\t" + "extp %[step3_12], $ac3, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), + [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), + [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), + [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), + [step3_15] "=r"(step3_15) + : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), + [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), + [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), + 
[step2_13] "r"(step2_13), [step2_14] "r"(step2_14), + [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); + + step2_18 = step1_17 - step1_18; + step2_29 = step1_30 - step1_29; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + + : [step3_18] "=r"(step3_18) + : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), + [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; + step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_19 = step1_16 - step1_19; + step2_28 = step1_31 - step1_28; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + + : [step3_19] "=r"(step3_19) + : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), + [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; + step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_16 = step1_16 + step1_19; + step3_17 = step1_17 + step1_18; + step3_30 = step1_29 + step1_30; + step3_31 = step1_28 + step1_31; + + step2_20 = step1_23 - step1_20; + step2_27 = step1_24 - step1_27; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" + "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac0, 31 \n\t" + + : [step3_20] "=r"(step3_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = -step2_20 * 
cospi_8_64 + step2_27 * cospi_24_64; + step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_21 = step1_22 - step1_21; + step2_26 = step1_25 - step1_26; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" + "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac1, 31 \n\t" + + : [step3_21] "=r"(step3_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; + step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_22 = step1_21 + step1_22; + step3_23 = step1_20 + step1_23; + step3_24 = step1_24 + step1_27; + step3_25 = step1_25 + step1_26; + + step2_16 = step3_16 + step3_23; + step2_17 = step3_17 + step3_22; + step2_18 = step3_18 + step3_21; + step2_19 = step3_19 + step3_20; + step2_20 = step3_19 - step3_20; + step2_21 = step3_18 - step3_21; + step2_22 = step3_17 - step3_22; + step2_23 = step3_16 - step3_23; + + step2_24 = step3_31 - step3_24; + step2_25 = step3_30 - step3_25; + step2_26 = step3_29 - step3_26; + step2_27 = step3_28 - step3_27; + step2_28 = step3_28 + step3_27; + step2_29 = step3_29 + step3_26; + step2_30 = step3_30 + step3_25; + step2_31 = step3_31 + step3_24; + + __asm__ __volatile__( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 32(%[input]) \n\t" + "lh %[load3], 16(%[input]) \n\t" + "lh %[load4], 48(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi 
$zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[temp2], $ac3, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[temp3], $ac1, 31 \n\t" + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "lh %[load1], 8(%[input]) \n\t" + "lh %[load2], 56(%[input]) \n\t" + "lh %[load3], 40(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_12_64] \n\t" + "msub $ac2, %[load4], %[cospi_20_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_20_64] \n\t" + "madd $ac1, %[load4], %[cospi_12_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, 
$ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load1], %[load1], %[temp0] \n\t" + "add %[load1], %[load1], %[temp1] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + "sub %[load2], %[load2], %[temp2] \n\t" + "add %[load2], %[load2], %[temp3] \n\t" + "madd $ac1, %[load1], %[cospi_16_64] \n\t" + "madd $ac3, %[load2], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[temp0], %[temp1] \n\t" + "add %[step1_7], %[temp3], %[temp2] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + step2_0 = step1_0 + step1_7; + step2_1 = step1_1 + step1_6; + step2_2 = step1_2 + step1_5; + step2_3 = step1_3 + step1_4; + step2_4 = step1_3 - step1_4; + step2_5 = step1_2 - step1_5; + step2_6 = step1_1 - step1_6; + step2_7 = step1_0 - step1_7; + + // stage 7 + step1_0 = step2_0 + step3_15; + step1_1 = step2_1 + step3_14; + step1_2 = step2_2 + step3_13; + step1_3 = step2_3 + step3_12; + step1_4 = step2_4 + step3_11; + step1_5 = step2_5 + step3_10; + step1_6 = step2_6 + step3_9; + step1_7 = step2_7 + step3_8; + step1_8 = step2_7 - step3_8; + step1_9 = step2_6 - step3_9; + step1_10 = step2_5 - step3_10; + step1_11 = step2_4 - step3_11; + step1_12 = step2_3 - step3_12; + step1_13 = step2_2 - step3_13; + step1_14 = step2_1 - step3_14; + step1_15 = step2_0 - step3_15; + + __asm__ __volatile__( + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + 
"mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_20 + step2_27) * cospi_16_64; + step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "sub %[temp0], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_21], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), + [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_21 + step2_26) * cospi_16_64; + step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) + : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), + [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_22 + step2_25) * cospi_16_64; + step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "sub %[temp0], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_23], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) + : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), + [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_23 + step2_24) * cospi_16_64; + step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "lbu 
%[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_0], %[step2_31] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_1], %[step2_30] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_2], %[step2_29] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_3], %[step2_28] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), + [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), + [step1_3] "r"(step1_3), [step2_28] "r"(step2_28), + [step2_29] "r"(step2_29), [step2_30] "r"(step2_30), + [step2_31] "r"(step2_31)); + + step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" 
+ "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_4], %[step1_27] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_5], %[step1_26] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_6], %[step1_25] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], 
%[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_7], %[step1_24] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4), + [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), + [step1_7] "r"(step1_7), [step1_24] "r"(step1_24), + [step1_25] "r"(step1_25), [step1_26] "r"(step1_26), + [step1_27] "r"(step1_27)); + + step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + 
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_8], %[step1_23] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_9], %[step1_22] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_10], %[step1_21] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_11], %[step1_20] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8), + [step1_9] "r"(step1_9), [step1_10] "r"(step1_10), + [step1_11] "r"(step1_11), [step1_20] "r"(step1_20), + [step1_21] "r"(step1_21), [step1_22] "r"(step1_22), + 
[step1_23] "r"(step1_23)); + + step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_12], %[step2_19] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_13], %[step2_18] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], 
%[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_14], %[step2_17] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_15], %[step2_16] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), + [step1_14] "r"(step1_14), [step1_15] "r"(step1_15), + [step2_16] "r"(step2_16), [step2_17] "r"(step2_17), + [step2_18] "r"(step2_18), [step2_19] "r"(step2_19)); + + step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 
0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + + input += 32; + } +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans32_dspr2.c b/third_party/aom/aom_dsp/mips/itrans32_dspr2.c new file mode 100644 index 000000000..fa7703217 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/itrans32_dspr2.c @@ -0,0 +1,1030 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "./aom_config.h" +#include "aom_dsp/mips/inv_txfm_dspr2.h" +#include "aom_dsp/txfm_common.h" + +#if HAVE_DSPR2 +static void idct32_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { + int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; + int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; + int16_t step1_28, step1_29, step1_30, step1_31; + int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int16_t step2_28, step2_29, step2_30, step2_31; + int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; + int16_t step3_29, step3_30, step3_31; + int temp0, temp1, temp2, temp3; + int load1, load2, load3, load4; + int result1, result2; + int temp21; + int i; + const int const_2_power_13 = 8192; + const int32_t *input_int; + + for (i = no_rows; i--;) { + input_int = (const int32_t *)input; + + if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | + input_int[4] | input_int[5] | input_int[6] | input_int[7] | + input_int[8] | input_int[9] | input_int[10] | input_int[11] | + input_int[12] | input_int[13] | input_int[14] | input_int[15])) { + input += 32; + + __asm__ __volatile__( + "sh $zero, 0(%[output]) \n\t" + "sh $zero, 64(%[output]) \n\t" + "sh $zero, 128(%[output]) \n\t" + "sh $zero, 192(%[output]) \n\t" + "sh $zero, 256(%[output]) \n\t" + "sh $zero, 320(%[output]) \n\t" + "sh 
$zero, 384(%[output]) \n\t" + "sh $zero, 448(%[output]) \n\t" + "sh $zero, 512(%[output]) \n\t" + "sh $zero, 576(%[output]) \n\t" + "sh $zero, 640(%[output]) \n\t" + "sh $zero, 704(%[output]) \n\t" + "sh $zero, 768(%[output]) \n\t" + "sh $zero, 832(%[output]) \n\t" + "sh $zero, 896(%[output]) \n\t" + "sh $zero, 960(%[output]) \n\t" + "sh $zero, 1024(%[output]) \n\t" + "sh $zero, 1088(%[output]) \n\t" + "sh $zero, 1152(%[output]) \n\t" + "sh $zero, 1216(%[output]) \n\t" + "sh $zero, 1280(%[output]) \n\t" + "sh $zero, 1344(%[output]) \n\t" + "sh $zero, 1408(%[output]) \n\t" + "sh $zero, 1472(%[output]) \n\t" + "sh $zero, 1536(%[output]) \n\t" + "sh $zero, 1600(%[output]) \n\t" + "sh $zero, 1664(%[output]) \n\t" + "sh $zero, 1728(%[output]) \n\t" + "sh $zero, 1792(%[output]) \n\t" + "sh $zero, 1856(%[output]) \n\t" + "sh $zero, 1920(%[output]) \n\t" + "sh $zero, 1984(%[output]) \n\t" + + : + : [output] "r"(output)); + + output += 1; + + continue; + } + + /* prefetch row */ + prefetch_load((const uint8_t *)(input + 32)); + prefetch_load((const uint8_t *)(input + 48)); + + __asm__ __volatile__( + "lh %[load1], 2(%[input]) \n\t" + "lh %[load2], 62(%[input]) \n\t" + "lh %[load3], 34(%[input]) \n\t" + "lh %[load4], 30(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_31_64] \n\t" + "msub $ac1, %[load2], %[cospi_1_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_1_64] \n\t" + "madd $ac3, %[load2], %[cospi_31_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_15_64] \n\t" + "msub $ac2, %[load4], %[cospi_17_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_17_64] \n\t" + "madd $ac1, %[load4], %[cospi_15_64] \n\t" + "extp %[temp2], 
$ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_17], $ac1, 31 \n\t" + "extp %[step1_30], $ac3, 31 \n\t" + "add %[step1_16], %[temp0], %[temp1] \n\t" + "add %[step1_31], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), + [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), + [step1_31] "=r"(step1_31) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), + [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( + "lh %[load1], 18(%[input]) \n\t" + "lh %[load2], 46(%[input]) \n\t" + "lh %[load3], 50(%[input]) \n\t" + "lh %[load4], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_23_64] \n\t" + "msub $ac1, %[load2], %[cospi_9_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_9_64] \n\t" + "madd $ac3, %[load2], %[cospi_23_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_7_64] \n\t" + "msub $ac2, %[load4], %[cospi_25_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_25_64] \n\t" + "madd 
$ac1, %[load4], %[cospi_7_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "msub $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_18], $ac1, 31 \n\t" + "extp %[step1_29], $ac3, 31 \n\t" + "add %[step1_19], %[temp0], %[temp1] \n\t" + "add %[step1_28], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), + [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), + [step1_29] "=r"(step1_29) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), + [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 54(%[input]) \n\t" + "lh %[load3], 42(%[input]) \n\t" + "lh %[load4], 22(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_27_64] \n\t" + "msub $ac1, %[load2], %[cospi_5_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_5_64] \n\t" + "madd $ac3, %[load2], %[cospi_27_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_11_64] \n\t" + "msub $ac2, %[load4], %[cospi_21_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + 
"madd $ac1, %[load3], %[cospi_21_64] \n\t" + "madd $ac1, %[load4], %[cospi_11_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "madd $ac1, %[load2], %[cospi_12_64] \n\t" + "msub $ac1, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load1], %[cospi_12_64] \n\t" + "madd $ac3, %[load2], %[cospi_20_64] \n\t" + + "extp %[step1_21], $ac1, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + "add %[step1_20], %[temp0], %[temp1] \n\t" + "add %[step1_27], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), + [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), + [step1_27] "=r"(step1_27) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), + [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( + "lh %[load1], 26(%[input]) \n\t" + "lh %[load2], 38(%[input]) \n\t" + "lh %[load3], 58(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_19_64] \n\t" + "msub $ac1, %[load2], %[cospi_13_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_13_64] \n\t" + "madd $ac3, %[load2], %[cospi_19_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_3_64] \n\t" + "msub $ac2, %[load4], 
%[cospi_29_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_29_64] \n\t" + "madd $ac1, %[load4], %[cospi_3_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_12_64] \n\t" + "msub $ac1, %[load2], %[cospi_20_64] \n\t" + "msub $ac3, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load2], %[cospi_12_64] \n\t" + + "extp %[step1_22], $ac1, 31 \n\t" + "extp %[step1_25], $ac3, 31 \n\t" + "add %[step1_23], %[temp0], %[temp1] \n\t" + "add %[step1_24], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), + [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), + [step1_25] "=r"(step1_25) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), + [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( + "lh %[load1], 4(%[input]) \n\t" + "lh %[load2], 60(%[input]) \n\t" + "lh %[load3], 36(%[input]) \n\t" + "lh %[load4], 28(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_30_64] \n\t" + "msub $ac1, %[load2], %[cospi_2_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_2_64] \n\t" + "madd $ac3, %[load2], %[cospi_30_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd 
$ac2, %[load3], %[cospi_14_64] \n\t" + "msub $ac2, %[load4], %[cospi_18_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_18_64] \n\t" + "madd $ac1, %[load4], %[cospi_14_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "msub $ac1, %[load1], %[cospi_8_64] \n\t" + "madd $ac1, %[load2], %[cospi_24_64] \n\t" + "madd $ac3, %[load1], %[cospi_24_64] \n\t" + "madd $ac3, %[load2], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[temp0], %[temp1] \n\t" + "add %[step2_15], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), + [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), + [step2_15] "=r"(step2_15) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( + "lh %[load1], 20(%[input]) \n\t" + "lh %[load2], 44(%[input]) \n\t" + "lh %[load3], 52(%[input]) \n\t" + "lh %[load4], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_10_64] \n\t" + "madd $ac3, %[load2], %[cospi_22_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo 
%[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_6_64] \n\t" + "msub $ac2, %[load4], %[cospi_26_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_26_64] \n\t" + "madd $ac1, %[load4], %[cospi_6_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[temp0], %[temp1] \n\t" + "add %[step2_12], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "sub %[temp0], %[step2_14], %[step2_13] \n\t" + "sub %[temp0], %[temp0], %[step2_9] \n\t" + "add %[temp0], %[temp0], %[step2_10] \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp1], %[step2_14], %[step2_13] \n\t" + "add %[temp1], %[temp1], %[step2_9] \n\t" + "sub %[temp1], %[temp1], %[step2_10] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + 
"mthi $zero, $ac2 \n\t" + "sub %[temp0], %[step2_15], %[step2_12] \n\t" + "sub %[temp0], %[temp0], %[step2_8] \n\t" + "add %[temp0], %[temp0], %[step2_11] \n\t" + "madd $ac2, %[temp0], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sub %[temp1], %[step2_15], %[step2_12] \n\t" + "add %[temp1], %[temp1], %[step2_8] \n\t" + "sub %[temp1], %[temp1], %[step2_11] \n\t" + "madd $ac3, %[temp1], %[cospi_16_64] \n\t" + + "add %[step3_8], %[step2_8], %[step2_11] \n\t" + "add %[step3_9], %[step2_9], %[step2_10] \n\t" + "add %[step3_14], %[step2_13], %[step2_14] \n\t" + "add %[step3_15], %[step2_12], %[step2_15] \n\t" + + "extp %[step3_10], $ac0, 31 \n\t" + "extp %[step3_13], $ac1, 31 \n\t" + "extp %[step3_11], $ac2, 31 \n\t" + "extp %[step3_12], $ac3, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), + [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), + [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), + [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), + [step3_15] "=r"(step3_15) + : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), + [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), + [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), + [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), + [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); + + step2_18 = step1_17 - step1_18; + step2_29 = step1_30 - step1_29; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + + : [step3_18] "=r"(step3_18) + : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), + [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; + step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_19 = 
step1_16 - step1_19; + step2_28 = step1_31 - step1_28; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + + : [step3_19] "=r"(step3_19) + : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), + [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; + step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_16 = step1_16 + step1_19; + step3_17 = step1_17 + step1_18; + step3_30 = step1_29 + step1_30; + step3_31 = step1_28 + step1_31; + + step2_20 = step1_23 - step1_20; + step2_27 = step1_24 - step1_27; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" + "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac0, 31 \n\t" + + : [step3_20] "=r"(step3_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; + step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_21 = step1_22 - step1_21; + step2_26 = step1_25 - step1_26; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" + "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac1, 31 \n\t" + + : [step3_21] "=r"(step3_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; + step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_22 = step1_21 + step1_22; 
+ step3_23 = step1_20 + step1_23; + step3_24 = step1_24 + step1_27; + step3_25 = step1_25 + step1_26; + + step2_16 = step3_16 + step3_23; + step2_17 = step3_17 + step3_22; + step2_18 = step3_18 + step3_21; + step2_19 = step3_19 + step3_20; + step2_20 = step3_19 - step3_20; + step2_21 = step3_18 - step3_21; + step2_22 = step3_17 - step3_22; + step2_23 = step3_16 - step3_23; + + step2_24 = step3_31 - step3_24; + step2_25 = step3_30 - step3_25; + step2_26 = step3_29 - step3_26; + step2_27 = step3_28 - step3_27; + step2_28 = step3_28 + step3_27; + step2_29 = step3_29 + step3_26; + step2_30 = step3_30 + step3_25; + step2_31 = step3_31 + step3_24; + + __asm__ __volatile__( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 32(%[input]) \n\t" + "lh %[load3], 16(%[input]) \n\t" + "lh %[load4], 48(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[temp2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[temp3], $ac1, 31 \n\t" + + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), 
[step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_16_64] "r"(cospi_16_64), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64) + + ); + + __asm__ __volatile__( + "lh %[load1], 8(%[input]) \n\t" + "lh %[load2], 56(%[input]) \n\t" + "lh %[load3], 40(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_12_64] \n\t" + "msub $ac2, %[load4], %[cospi_20_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_20_64] \n\t" + "madd $ac1, %[load4], %[cospi_12_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load1], %[load1], %[temp0] \n\t" + "add %[load1], %[load1], %[temp1] \n\t" + + "sub %[load2], %[temp0], %[temp1] \n\t" + "sub %[load2], %[load2], %[temp2] \n\t" + "add %[load2], %[load2], %[temp3] \n\t" + + "madd $ac1, %[load1], %[cospi_16_64] \n\t" + "madd $ac3, %[load2], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[temp0], %[temp1] \n\t" + "add %[step1_7], %[temp3], %[temp2] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] 
"=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + step2_0 = step1_0 + step1_7; + step2_1 = step1_1 + step1_6; + step2_2 = step1_2 + step1_5; + step2_3 = step1_3 + step1_4; + step2_4 = step1_3 - step1_4; + step2_5 = step1_2 - step1_5; + step2_6 = step1_1 - step1_6; + step2_7 = step1_0 - step1_7; + + step1_0 = step2_0 + step3_15; + step1_1 = step2_1 + step3_14; + step1_2 = step2_2 + step3_13; + step1_3 = step2_3 + step3_12; + step1_4 = step2_4 + step3_11; + step1_5 = step2_5 + step3_10; + step1_6 = step2_6 + step3_9; + step1_7 = step2_7 + step3_8; + step1_8 = step2_7 - step3_8; + step1_9 = step2_6 - step3_9; + step1_10 = step2_5 - step3_10; + step1_11 = step2_4 - step3_11; + step1_12 = step2_3 - step3_12; + step1_13 = step2_2 - step3_13; + step1_14 = step2_1 - step3_14; + step1_15 = step2_0 - step3_15; + + __asm__ __volatile__( + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_20 + step2_27) * cospi_16_64; + step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "sub %[temp0], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_21], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_26] 
"r"(step2_26), + [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_21 + step2_26) * cospi_16_64; + step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) + : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), + [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_22 + step2_25) * cospi_16_64; + step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "sub %[temp0], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_23], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) + : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), + [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_23 + step2_24) * cospi_16_64; + step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + // final stage + output[0 * 32] = step1_0 + step2_31; + output[1 * 32] = step1_1 + step2_30; + output[2 * 32] = step1_2 + step2_29; + output[3 * 32] = step1_3 + step2_28; + output[4 * 32] = step1_4 + step1_27; + output[5 * 32] = step1_5 + step1_26; + output[6 * 32] = step1_6 + step1_25; + output[7 * 32] = step1_7 + step1_24; + output[8 * 32] = step1_8 + step1_23; + output[9 * 32] = step1_9 + step1_22; + output[10 * 32] = step1_10 + step1_21; + output[11 * 32] = step1_11 + step1_20; + output[12 * 32] = step1_12 + step2_19; + output[13 * 32] = step1_13 + step2_18; + output[14 * 32] = step1_14 + step2_17; + output[15 * 32] = step1_15 + step2_16; + output[16 * 32] = step1_15 - step2_16; + output[17 * 32] = step1_14 - step2_17; + output[18 * 32] = step1_13 - 
step2_18; + output[19 * 32] = step1_12 - step2_19; + output[20 * 32] = step1_11 - step1_20; + output[21 * 32] = step1_10 - step1_21; + output[22 * 32] = step1_9 - step1_22; + output[23 * 32] = step1_8 - step1_23; + output[24 * 32] = step1_7 - step1_24; + output[25 * 32] = step1_6 - step1_25; + output[26 * 32] = step1_5 - step1_26; + output[27 * 32] = step1_4 - step1_27; + output[28 * 32] = step1_3 - step2_28; + output[29 * 32] = step1_2 - step2_29; + output[30 * 32] = step1_1 - step2_30; + output[31 * 32] = step1_0 - step2_31; + + input += 32; + output += 1; + } +} +/* 32x32 inverse transform and add, full version: the row pass is run for all 32 rows into the int16 scratch buffer out[32 * 32], which is then handed with dest and dest_stride to the column/add pass. */ +void aom_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[32 * 32]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit position for extract from acc: wrdsp with mask 1 writes pos (45); presumably the position field used by later extp instructions - confirm in the MIPS DSP ASE manual */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + // Rows + idct32_rows_dspr2(input, outptr, 32); + + // Columns + aom_idct32_cols_add_blk_dspr2(out, dest, dest_stride); +} +/* Reduced version: the row pass is run for 8 rows only, and the 24 int16 entries (12 words) that follow the first 8 in each of the 32 rows of out are cleared by hand before the column/add pass. Presumably meant for blocks whose non-zero coefficients all lie in the first 8 rows - confirm with the caller. NOTE(review): the sw stores below go through outptr without a memory clobber or memory output operand - confirm the compiler cannot reorder accesses to out around them. */ +void aom_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, + int stride) { + DECLARE_ALIGNED(32, int16_t, out[32 * 32]); + int16_t *outptr = out; + uint32_t i; + uint32_t pos = 45; + + /* bit position for extract from acc: wrdsp with mask 1 writes pos (45); presumably the position field used by later extp instructions - confirm in the MIPS DSP ASE manual */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + // Rows + idct32_rows_dspr2(input, outptr, 8); + + outptr += 8; /* step over the first 8 entries of row 0; presumably those are the ones the 8-row pass wrote (it stores transposed, one column per input row) */ + __asm__ __volatile__( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 8(%[outptr]) \n\t" + "sw $zero, 12(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 24(%[outptr]) \n\t" + "sw $zero, 28(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 40(%[outptr]) \n\t" + "sw $zero, 44(%[outptr]) \n\t" + /* 12 word stores = 48 bytes = 24 int16 cleared in row 0 */ + : + : [outptr] "r"(outptr)); + + for (i = 0; i < 31; ++i) { + outptr += 32; + /* same 12-word clear for each of the other 31 rows (outptr moves one 32-entry row at a time) */ + __asm__ __volatile__( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 8(%[outptr]) \n\t" + 
"sw $zero, 12(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 24(%[outptr]) \n\t" + "sw $zero, 28(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 40(%[outptr]) \n\t" + "sw $zero, 44(%[outptr]) \n\t" + + : + : [outptr] "r"(outptr)); + } + + // Columns + aom_idct32_cols_add_blk_dspr2(out, dest, stride); +} +/* DC-only version: only input[0] is read. a1 is derived from it below and applied to 32 bytes in each of 32 rows of dest (rows are stride bytes apart): addu_s.qb of a1 when a1 >= 0, subu_s.qb of abs(a1) when a1 < 0. */ +void aom_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, + int stride) { + int r, out; + int32_t a1, absa1; + int32_t vector_a1; + int32_t t1, t2, t3, t4; + int32_t vector_1, vector_2, vector_3, vector_4; + uint32_t pos = 45; + + /* bit position for extract from acc: wrdsp with mask 1 writes pos (45); presumably the position field used by later extp instructions - confirm in the MIPS DSP ASE manual */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + + : + : [pos] "r"(pos)); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__( + "addi %[out], %[out], 32 \n\t" + "sra %[a1], %[out], 6 \n\t" + /* a1 = (out + 32) >> 6, arithmetic shift */ + : [out] "+r"(out), [a1] "=r"(a1) + :); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned. NOTE(review): replv.qb replicates only the low byte of its source, so abs(a1) above 255 would be truncated - confirm the range of a1 */ + __asm__ __volatile__( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 32; r--;) { /* one 32-byte row of dest per iteration, handled as two groups of four words */ + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], 
%[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 32; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] 
"r"(stride), [vector_a1] "r"(vector_a1)); + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans4_dspr2.c b/third_party/aom/aom_dsp/mips/itrans4_dspr2.c new file mode 100644 index 000000000..e6d0367cd --- /dev/null +++ b/third_party/aom/aom_dsp/mips/itrans4_dspr2.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/inv_txfm_dspr2.h" +#include "aom_dsp/txfm_common.h" + +#if HAVE_DSPR2 +void aom_idct4_rows_dspr2(const int16_t *input, int16_t *output) { + int16_t step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; + const int const_2_power_13 = 8192; /* moved into the low word of $ac0/$ac1 (mtlo) before each madd/msub chain; presumably the rounding term of dct_const_round_shift in the pseudo-code below */ + int i; + /* One iteration per input row of 4 coefficients. Each row yields 4 values stored 4 int16 apart (sh at byte offsets 0, 8, 16, 24) while output advances by 1 per row, i.e. the 4x4 result is written transposed. NOTE(review): those sh stores have no memory clobber in the constraint list - confirm this is safe. */ + for (i = 4; i--;) { + __asm__ __volatile__( + /* + temp_1 = (input[0] + input[2]) * cospi_16_64; + step_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[2]) * cospi_16_64; + step_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 4(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "extp %[step_0], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "extp %[step_1], 
$ac1, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + /* + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + step_2 = dct_const_round_shift(temp1); + */ + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "extp %[step_2], $ac0, 31 \n\t" + + /* + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step_3 = dct_const_round_shift(temp2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[step_3], $ac1, 31 \n\t" + + /* + output[0] = step_0 + step_3; + output[4] = step_1 + step_2; + output[8] = step_1 - step_2; + output[12] = step_0 - step_3; + */ + "add %[Temp0], %[step_0], %[step_3] \n\t" + "sh %[Temp0], 0(%[output]) \n\t" + + "add %[Temp1], %[step_1], %[step_2] \n\t" + "sh %[Temp1], 8(%[output]) \n\t" + + "sub %[Temp2], %[step_1], %[step_2] \n\t" + "sh %[Temp2], 16(%[output]) \n\t" + + "sub %[Temp3], %[step_0], %[step_3] \n\t" + "sh %[Temp3], 24(%[output]) \n\t" + + : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), + [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [output] "+r"(output) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input)); + + input += 4; + output += 1; + } +} + +void aom_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int16_t step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; + const int const_2_power_13 = 8192; + int i; + uint8_t *dest_pix; + uint8_t *cm = aom_ff_cropTbl; + + /* prefetch aom_ff_cropTbl */ + prefetch_load(aom_ff_cropTbl); + prefetch_load(aom_ff_cropTbl + 32); + prefetch_load(aom_ff_cropTbl + 64); + prefetch_load(aom_ff_cropTbl + 96); + prefetch_load(aom_ff_cropTbl + 128); + prefetch_load(aom_ff_cropTbl + 160); + prefetch_load(aom_ff_cropTbl + 192); + 
prefetch_load(aom_ff_cropTbl + 224); + /* iteration i handles 4 int16 of input and destination column i: dest_pix starts at dest + i and moves down by dest_stride after each of the first three pixels */ + for (i = 0; i < 4; ++i) { + dest_pix = (dest + i); + + __asm__ __volatile__( + /* + temp_1 = (input[0] + input[2]) * cospi_16_64; + step_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[2]) * cospi_16_64; + step_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 4(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "extp %[step_0], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "extp %[step_1], $ac1, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + /* + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + step_2 = dct_const_round_shift(temp1); + */ + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "extp %[step_2], $ac0, 31 \n\t" + + /* + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step_3 = dct_const_round_shift(temp2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[step_3], $ac1, 31 \n\t" + + /* + pixel 0 uses step_0 + step_3; pixel 1 uses step_1 + step_2; + pixel 2 uses step_1 - step_2; pixel 3 uses step_0 - step_3. + Each value is rounded as (x + 8) >> 4, added to the byte at + dest_pix and looked up in the cm byte table (presumably a clamp). + */ + "add %[Temp0], %[step_0], %[step_3] \n\t" + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step_1], %[step_2] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 
\n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step_1], %[step_2] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step_0], %[step_3] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + + : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), + [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), + [dest_pix] "+r"(dest_pix) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), + [dest_stride] "r"(dest_stride)); + + input += 4; + } +} + +void aom_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[4 * 4]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + // Rows + aom_idct4_rows_dspr2(input, outptr); + + // Columns + aom_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); +} + +void aom_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + int a1, absa1; + int r; + int32_t out; + int t2, vector_a1, vector_a; + uint32_t pos = 45; + int16_t input_dc = input[0]; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp 
%[pos], 1 \n\t" + + : + : [pos] "r"(pos)); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc); + __asm__ __volatile__( + "addi %[out], %[out], 8 \n\t" + "sra %[a1], %[out], 4 \n\t" + + : [out] "+r"(out), [a1] "=r"(a1) + :); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 4; r--;) { + __asm__ __volatile__( + "lw %[t2], 0(%[dest]) \n\t" + "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" + "sw %[vector_a], 0(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 4; r--;) { + __asm__ __volatile__( + "lw %[t2], 0(%[dest]) \n\t" + "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" + "sw %[vector_a], 0(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + } + } +} + +void iadst4_dspr2(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + int x0, x1, x2, x3; + + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = x0 - x2 + x3; + + x0 = s0 + s3 + s5; + x1 = s1 - s4 - s6; + x2 = sinpi_3_9 * s7; + x3 = s2; + + s0 = x0 + x3; + s1 = x1 + x3; + s2 = x2; + s3 = x0 
+ x1 - x3; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. + output[0] = dct_const_round_shift(s0); + output[1] = dct_const_round_shift(s1); + output[2] = dct_const_round_shift(s2); + output[3] = dct_const_round_shift(s3); +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans8_dspr2.c b/third_party/aom/aom_dsp/mips/itrans8_dspr2.c new file mode 100644 index 000000000..0a20f76f2 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/itrans8_dspr2.c @@ -0,0 +1,645 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/inv_txfm_dspr2.h" +#include "aom_dsp/txfm_common.h" + +#if HAVE_DSPR2 +void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + const int const_2_power_13 = 8192; + int Temp0, Temp1, Temp2, Temp3, Temp4; + int i; + + for (i = no_rows; i--;) { + __asm__ __volatile__( + /* + temp_1 = (input[0] + input[4]) * cospi_16_64; + step2_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[4]) * cospi_16_64; + step2_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 8(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "extp %[Temp4], $ac0, 31 \n\t" + + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + /* + temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; + step2_2 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 4(%[input]) \n\t" + "lh %[Temp1], 12(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "extp %[Temp3], $ac0, 31 \n\t" + + /* + step1_1 = step2_1 + step2_2; + step1_2 = step2_1 - step2_2; + */ + "add %[step1_1], %[Temp2], %[Temp3] \n\t" + "sub %[step1_2], %[Temp2], %[Temp3] \n\t" + + /* + temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; + step2_3 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + 
+ /* + step1_0 = step2_0 + step2_3; + step1_3 = step2_0 - step2_3; + */ + "add %[step1_0], %[Temp4], %[Temp1] \n\t" + "sub %[step1_3], %[Temp4], %[Temp1] \n\t" + + /* + temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + step1_4 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 2(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp1], 14(%[input]) \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" + "extp %[step1_4], $ac0, 31 \n\t" + + /* + temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1_7 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" + "extp %[step1_7], $ac1, 31 \n\t" + + /* + temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + step1_5 = dct_const_round_shift(temp_1); + */ + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" + "extp %[step1_5], $ac0, 31 \n\t" + + /* + temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1_6 = dct_const_round_shift(temp_2); + */ + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* + temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; + temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; + */ + "sub %[Temp0], %[step1_7], %[step1_6] \n\t" + "sub %[Temp0], %[Temp0], %[step1_4] \n\t" + "add %[Temp0], %[Temp0], %[step1_5] \n\t" + "sub %[Temp1], %[step1_4], %[step1_5] \n\t" + "sub %[Temp1], %[Temp1], %[step1_6] \n\t" + "add %[Temp1], %[Temp1], %[step1_7] \n\t" + + "mtlo %[const_2_power_13], 
$ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" + + /* + step1_4 = step1_4 + step1_5; + step1_7 = step1_6 + step1_7; + */ + "add %[step1_4], %[step1_4], %[step1_5] \n\t" + "add %[step1_7], %[step1_7], %[step1_6] \n\t" + + "extp %[step1_5], $ac0, 31 \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + "add %[Temp0], %[step1_0], %[step1_7] \n\t" + "sh %[Temp0], 0(%[output]) \n\t" + "add %[Temp1], %[step1_1], %[step1_6] \n\t" + "sh %[Temp1], 16(%[output]) \n\t" + "add %[Temp0], %[step1_2], %[step1_5] \n\t" + "sh %[Temp0], 32(%[output]) \n\t" + "add %[Temp1], %[step1_3], %[step1_4] \n\t" + "sh %[Temp1], 48(%[output]) \n\t" + + "sub %[Temp0], %[step1_3], %[step1_4] \n\t" + "sh %[Temp0], 64(%[output]) \n\t" + "sub %[Temp1], %[step1_2], %[step1_5] \n\t" + "sh %[Temp1], 80(%[output]) \n\t" + "sub %[Temp0], %[step1_1], %[step1_6] \n\t" + "sh %[Temp0], 96(%[output]) \n\t" + "sub %[Temp1], %[step1_0], %[step1_7] \n\t" + "sh %[Temp1], 112(%[output]) \n\t" + + : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), + [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), + [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5), + [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), + [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_24_64] "r"(cospi_24_64), [output] "r"(output), + [input] "r"(input)); + + input += 8; + output += 1; + } +} + +void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int Temp0, Temp1, Temp2, Temp3; + int i; + 
const int const_2_power_13 = 8192; + uint8_t *dest_pix; + uint8_t *cm = aom_ff_cropTbl; + + /* prefetch aom_ff_cropTbl */ + prefetch_load(aom_ff_cropTbl); + prefetch_load(aom_ff_cropTbl + 32); + prefetch_load(aom_ff_cropTbl + 64); + prefetch_load(aom_ff_cropTbl + 96); + prefetch_load(aom_ff_cropTbl + 128); + prefetch_load(aom_ff_cropTbl + 160); + prefetch_load(aom_ff_cropTbl + 192); + prefetch_load(aom_ff_cropTbl + 224); + + for (i = 0; i < 8; ++i) { + dest_pix = (dest + i); + + __asm__ __volatile__( + /* + temp_1 = (input[0] + input[4]) * cospi_16_64; + step2_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[4]) * cospi_16_64; + step2_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 8(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "extp %[step1_6], $ac0, 31 \n\t" + + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + /* + temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; + step2_2 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 4(%[input]) \n\t" + "lh %[Temp1], 12(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "extp %[Temp3], $ac0, 31 \n\t" + + /* + step1_1 = step2_1 + step2_2; + step1_2 = step2_1 - step2_2; + */ + "add %[step1_1], %[Temp2], %[Temp3] \n\t" + "sub %[step1_2], %[Temp2], %[Temp3] \n\t" + + /* + temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; + step2_3 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "mtlo 
%[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + /* + step1_0 = step2_0 + step2_3; + step1_3 = step2_0 - step2_3; + */ + "add %[step1_0], %[step1_6], %[Temp1] \n\t" + "sub %[step1_3], %[step1_6], %[Temp1] \n\t" + + /* + temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + step1_4 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 2(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp1], 14(%[input]) \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" + "extp %[step1_4], $ac0, 31 \n\t" + + /* + temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1_7 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" + "extp %[step1_7], $ac1, 31 \n\t" + + /* + temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + step1_5 = dct_const_round_shift(temp_1); + */ + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" + "extp %[step1_5], $ac0, 31 \n\t" + + /* + temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1_6 = dct_const_round_shift(temp_2); + */ + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* + temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; + temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; + */ + "sub %[Temp0], %[step1_7], %[step1_6] \n\t" + "sub %[Temp0], %[Temp0], %[step1_4] \n\t" + "add %[Temp0], %[Temp0], %[step1_5] \n\t" + "sub %[Temp1], %[step1_4], %[step1_5] \n\t" + "sub %[Temp1], %[Temp1], %[step1_6] \n\t" + "add 
%[Temp1], %[Temp1], %[step1_7] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" + + /* + step1_4 = step1_4 + step1_5; + step1_7 = step1_6 + step1_7; + */ + "add %[step1_4], %[step1_4], %[step1_5] \n\t" + "add %[step1_7], %[step1_7], %[step1_6] \n\t" + + "extp %[step1_5], $ac0, 31 \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* add block */ + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp0], %[step1_0], %[step1_7] \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_1], %[step1_6] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_2], %[step1_5] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_3], %[step1_4] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_3], %[step1_4] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], 
%[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_2], %[step1_5] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_1], %[step1_6] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_0], %[step1_7] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + + : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), + [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), + [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5), + [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), + [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), + [dest_stride] "r"(dest_stride)); + + input += 8; + } +} + +void aom_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + 
uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); + + // First transform rows + idct8_rows_dspr2(input, outptr, 8); + + // Then transform columns and add to dest + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); +} + +void aom_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); + + // First transform rows + idct8_rows_dspr2(input, outptr, 4); + + outptr += 4; + + __asm__ __volatile__( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 48(%[outptr]) \n\t" + "sw $zero, 52(%[outptr]) \n\t" + "sw $zero, 64(%[outptr]) \n\t" + "sw $zero, 68(%[outptr]) \n\t" + "sw $zero, 80(%[outptr]) \n\t" + "sw $zero, 84(%[outptr]) \n\t" + "sw $zero, 96(%[outptr]) \n\t" + "sw $zero, 100(%[outptr]) \n\t" + "sw $zero, 112(%[outptr]) \n\t" + "sw $zero, 116(%[outptr]) \n\t" + + : + : [outptr] "r"(outptr)); + + // Then transform columns and add to dest + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); +} + +void aom_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + uint32_t pos = 45; + int32_t out; + int32_t r; + int32_t a1, absa1; + int32_t t1, t2, vector_a1, vector_1, vector_2; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + + : + : [pos] "r"(pos)); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__( + "addi %[out], %[out], 16 \n\t" + "sra %[a1], %[out], 5 \n\t" + + : [out] "+r"(out), [a1] "=r"(a1) + :); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ 
__volatile__( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 8; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), + [vector_2] "=&r"(vector_2), [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 8; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), + [vector_2] "=&r"(vector_2), [dest] "+r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + } + } +} + +void iadst8_dspr2(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + int x0, x1, x2, x3, x4, x5, x6, x7; + + x0 = input[7]; + x1 = input[0]; + x2 = input[5]; + x3 = input[2]; + x4 = input[3]; + x5 = input[4]; + x6 = input[1]; + x7 = input[6]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = output[7] = 0; + return; + } + + // stage 1 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s3 = cospi_22_64 * 
x2 - cospi_10_64 * x3; + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + + x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS); + x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS); + x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS); + x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS); + x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS); + x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS); + x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS); + + // stage 3 + s2 = cospi_16_64 * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (x6 - x7); + + x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS); + x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS); + + output[0] = x0; + output[1] = -x4; + output[2] = x6; + output[3] = -x2; + output[4] = x3; + output[5] = -x7; + output[6] = x5; + output[7] = -x1; +} +#endif // HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c new file mode 100644 index 000000000..fc0c32ce3 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c @@ -0,0 +1,1487 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_ports/mem.h" +#include "aom_dsp/mips/loopfilter_msa.h" + +int32_t aom_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + + /* load vector elements */ + LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); + + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + 
p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { + v16u8 flat, flat2, filter8; + v16i8 zero = { 0 }; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; + v8i16 l_out, r_out; + + flat = LD_UB(filter48 + 96); + + LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7); + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, 
q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + src -= 3 * pitch; + ST_UB4(p2, p1, p0, q0, src, pitch); + src += (4 * pitch); + ST_UB2(q1, q2, src, pitch); + } else { + src -= 7 * pitch; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); + + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, + p5_l_in, p4_l_in); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, + p1_l_in, p0_l_in); + q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST_UB(p6, src); + src += pitch; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST_UB(p5, src); + src += pitch; + + /* p4 
*/ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4); + + q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST_UB(p4, src); + src += pitch; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST_UB(p3, src); + src += pitch; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + 
l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = 
__msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST_UB(q3, src); + src += pitch; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST_UB(q4, src); + src += pitch; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST_UB(q5, src); + src += pitch; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, 
flat2); + ST_UB(q6, src); + } +} + +/* Two-stage horizontal filter driver: runs aom_hz_lpf_t4_and_t8_16w with a 16 * 8 byte aligned scratch buffer (filter48) and, only when that call returns 0, continues with aom_hz_lpf_t16_16w on the same buffer. The count argument is accepted but ignored. */ void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, int32_t count) { + DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]); + uint8_t early_exit = 0; + + (void)count; + + early_exit = aom_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr, + limit_ptr, thresh_ptr); + + if (0 == early_exit) { + aom_hz_lpf_t16_16w(src, pitch, filter48); + } +} + +/* Horizontal edge filter. When count is 1 only element 0 (the low 64 bits) of each row vector is written back: the filter4 rows p1..q1 if flat tests all-zero, the filter8 rows p2..q2 if flat2 tests all-zero, otherwise 14 rows starting at src - 7 * pitch (p6..q6) blended by flat2. Any other count is forwarded to aom_lpf_horizontal_16_dual_msa. */ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, int32_t count) { + if (1 == count) { + uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; + uint64_t dword0, dword1; + v16u8 flat2, mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 p0_filter16, p1_filter16; + v8i16 p2_filter8, p1_filter8, p0_filter8; + v8i16 q0_filter8, q1_filter8, q2_filter8; + v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r; + v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; + v16i8 zero = { 0 }; + v8u16 tmp0, tmp1, tmp2; + + /* load the four rows above and the four rows below the edge */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + /* broadcast the scalar thresholds to every byte lane */ thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + /* NOTE(review): presumably clears the upper half of flat so the all-zero test below only reflects the 8 pixels stored by this path - confirm against the MSA ilvr.d definition */ flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch); + } else { +
/* convert 8 bit input data into 16 bit */ + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, + q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, + q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); + PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat); + + /* load 16 vector elements */ + LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4); + LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7); + + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + p2_d = __msa_copy_u_d((v2i64)p2_out, 0); + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + q2_d = __msa_copy_u_d((v2i64)q2_out, 0); + + SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch); + SD(q1_d, src + pitch); + SD(q2_d, src + 2 * pitch); + } else { + /* LSB(right) 8 pixel operation */ + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5, + zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r, + q7_r); + + tmp0 = p7_r << 3; + tmp0 -= p7_r; + tmp0 += p6_r; + tmp0 += q0_r; + + src -= 7 * pitch; + + /* calculation of p6 and p5 */ + tmp1 = p6_r + p5_r + p4_r + p3_r; + tmp1 += (p2_r + p1_r + p0_r); + tmp1 += tmp0; + p0_filter16 = 
(v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp0 = p5_r - p6_r + q1_r - p7_r; + tmp1 += tmp0; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p4 and p3 */ + tmp0 = p4_r - p5_r + q2_r - p7_r; + tmp2 = p3_r - p4_r + q3_r - p7_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p2 and p1 */ + tmp0 = p2_r - p3_r + q4_r - p7_r; + tmp2 = p1_r - p2_r + q5_r - p7_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p0 and q0 */ + tmp0 = (p0_r - p1_r) + (q6_r - p7_r); + tmp2 = (q7_r - p0_r) + (q0_r - p7_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, 
p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q1 and q2 */ + tmp0 = q7_r - q0_r + q1_r - p6_r; + tmp2 = q7_r - q1_r + q2_r - p5_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q3 and q4 */ + tmp0 = (q7_r - q2_r) + (q3_r - p4_r); + tmp2 = (q7_r - q3_r) + (q4_r - p3_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q5 and q6 */ + tmp0 = (q7_r - q4_r) + (q5_r - p2_r); + tmp2 = (q7_r - q5_r) + (q6_r - p1_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q6, 
p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + } + } + } else { + /* any count other than 1 takes the two-stage 16-wide path */ aom_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr, + thresh_ptr, count); + } +} + +/* Public entry point: forwards to mb_lpf_horizontal_edge with count fixed to 1 (the path that stores 64 bits per row). */ void aom_lpf_horizontal_edge_8_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1); +} + +/* Public entry point: forwards to mb_lpf_horizontal_edge with count fixed to 2, which that function hands on to aom_lpf_horizontal_16_dual_msa. */ void aom_lpf_horizontal_edge_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2); +} + +/* Loads eight 16-byte rows from input (stride in_pitch) and stores sixteen vectors to output (stride out_pitch): p7..p0 produced by TRANSPOSE8x8_UB_UB, then q0..q7 assembled from the ILVL interleaves of the same rows. NOTE(review): presumably only the low 8 bytes of each stored vector are meaningful - confirm against the macro definitions. */ static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, + uint8_t *output, int32_t out_pitch) { + v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org; + v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + + LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, + p1_org, p0_org); + /* 8x8 transpose */ + TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, + p0_org, p7, p6, p5, p4, p3, p2, p1, p0); + /* 8x8 transpose */ + ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org, + tmp0, tmp1, tmp2, tmp3); + ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6); + ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7); + ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4); + ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6); + /* q1, q3, q5, q7 are derived from q0, q2, q4, q6 with an 8-byte slide */ SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8); + + ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); + output += (8 * out_pitch); + ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); +} + +/* Loads sixteen vectors from input (stride in_pitch), runs TRANSPOSE16x8_UB_UB on them and stores the eight resulting vectors to output (stride out_pitch). aom_lpf_vertical_16_msa uses it to copy its scratch buffer back to the frame. */ static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch, + uint8_t *output, int32_t out_pitch) { + v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o,
p1_o, p0_o; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + + LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7); + TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, + q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o); + ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch); +} + +/* Loads sixteen 16-byte rows from input (stride in_pitch) and stores sixteen vectors to output (stride out_pitch): p7..p0 come from TRANSPOSE16x8_UB_UB, q0..q7 are built from the odd doublewords of the rows through the ilvod/ilvev byte, halfword and word interleaves that follow. */ static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output, + int32_t out_pitch) { + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7; + v4i32 tmp2, tmp3; + + LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7); + input += (8 * in_pitch); + LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15); + + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p7, p6, + p5, p4, p3, p2, p1, p0); + + /* transpose 16x8 matrix into 8x16 */ + /* total 8 intermediate registers and 32 instructions */ + /* pair the odd doubleword of row N with that of row N + 8 */ q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0); + q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1); + q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2); + q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3); + q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4); + q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5); + q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6); + q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7); + + ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1); + tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7); + tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5); + + /* q5 and q7 are reused here as temporaries; both are overwritten with their final values further down */ ILVEV_B2_UB(q3, q2, q1, q0, q5, q7); + tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3); + tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1); + +
ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3); + q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0); + tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5); + q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3); + q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4); + tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6); + q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); + output += (8 * out_pitch); + ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); +} + +/* First stage of the vertical filter on a transposed scratch buffer with a fixed 16-byte stride. src points at the q0 row of that buffer; src_org and pitch_org address the frame. Returns 1 after storing the filter4 result (4 bytes on each of 8 rows at src_org - 2) when flat tests all-zero. Otherwise returns 0 after writing the six blended rows p2..q2 to filter48 and the flat mask to filter48 + 6 * 16. */ int32_t aom_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch_org, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v16i8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3; + + /* load vector elements */ + LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + /* NOTE(review): presumably keeps only the low 8 lanes of flat, matching the 8-wide variant - confirm against the MSA ilvr.d definition */ flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); +
ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org); + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + /* convert 16 bit output data into 8 bit */ + p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r); + p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r); + p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r); + q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r); + q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r); + q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r); + + /* merge the filter8 results into the filter4 / unfiltered rows under the flat mask; the stores to filter48 follow */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + /* the seventh slot carries the flat mask for the next stage */ ST_UB(flat, filter48); + + return 0; + } +} + +/* Second stage of the vertical filter on the transposed scratch buffer (16-byte stride, src at the q0 row). Reads the flat mask from filter48 + 6 * 16 and derives flat2 with AOM_FLAT5. Returns 1 when flat2 tests all-zero, after storing the six rows held in filter48 to the frame as 6 bytes on each of 8 rows starting at src_org - 3. Otherwise returns 0 after rewriting 8 bytes of each of the 14 buffer rows p6..q6 starting at src - 7 * 16. */ int32_t aom_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { + v16i8 zero = { 0 }; + v16u8 filter8, flat, flat2; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 tmp0_r, tmp1_r; + v8i16 r_out; + + flat = LD_UB(filter48 + 6 * 16); + + LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); + + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if
(__msa_test_bz_v(flat2)) { + v8i16 vec0, vec1, vec2, vec3, vec4; + + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1); + + src_org -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 4, (src_org + 4), pitch); + + return 1; + } else { + src -= 7 * 16; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST8x1_UB(p6, src); + src += 16; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST8x1_UB(p5, src); + src += 16; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST8x1_UB(p4, src); + src += 16; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += 
q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST8x1_UB(p3, src); + src += 16; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = 
(v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST8x1_UB(q3, src); + src += 16; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST8x1_UB(q4, src); + src += 16; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST8x1_UB(q5, src); + src += 16; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST8x1_UB(q6, src); + + return 0; + } +} + +/* Vertical 16-tap filter for one 8-row edge. Transposes the 16-byte-wide region starting at src - 8 into a 16 * 24 byte aligned scratch buffer, runs aom_vt_lpf_t4_and_t8_8w and then aom_vt_lpf_t16_8w on it, and transposes the buffer back to the frame only when both stages return 0. The last 16 * 8 bytes of the scratch buffer serve as filter48. */ void aom_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint8_t early_exit = 0; + DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); + /* filter48 starts right after the 16 vectors written by the transpose */ uint8_t *filter48 = &transposed_input[16 * 16]; + + transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); + +
early_exit = + aom_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); + + if (0 == early_exit) { + early_exit = aom_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, + &filter48[0]); + + if (0 == early_exit) { + /* both stages returned 0, so the result lives only in the scratch buffer; the paths that return 1 have already stored to src themselves */ transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch); + } + } +} + +/* 16-row counterpart of the first vertical stage, again on a transposed scratch buffer with a 16-byte stride (src at the q0 row). Returns 1 after storing the filter4 result (4 bytes on each of 16 rows at src_org - 2) when flat tests all-zero. Otherwise runs AOM_FILTER8 on both the right and left halves, writes the six blended rows p2..q2 to filter48 and the flat mask to filter48 + 6 * 16, and returns 0. */ int32_t aom_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16i8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + + /* load vector elements */ + LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec4, vec5); + + /* first 8 rows come from the right-half interleaves, the next 8 from the left-half ones */ src_org -= 2; + ST4x8_UB(vec2, vec3, src_org, pitch); + src_org += 8 * pitch; + ST4x8_UB(vec4, vec5, src_org, pitch); + + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3,
p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +int32_t aom_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { + v16u8 flat, flat2, filter8; + v16i8 zero = { 0 }; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; + v8i16 l_out, r_out; + + flat = LD_UB(filter48 + 6 * 16); + + LD_UB8((src 
- 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); + + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec6, vec7); + ILVRL_B2_SH(q2, q1, vec2, vec5); + + src_org -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 4, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec5, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec5, 4, (src_org + 4), pitch); + + return 1; + } else { + src -= 7 * 16; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, + p5_l_in, p4_l_in); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, + p1_l_in, p0_l_in); + q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += 
p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST_UB(p6, src); + src += 16; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST_UB(p5, src); + src += 16; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST_UB(p4, src); + src += 16; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST_UB(p3, src); + src += 16; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = 
__msa_srari_h((v8i16)tmp1_r, 4); + q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)(tmp1_l), 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = 
__msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST_UB(q3, src); + src += 16; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST_UB(q4, src); + src += 16; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = 
__msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST_UB(q5, src); + src += 16; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST_UB(q6, src); + + return 0; + } +} + /* Vertical-edge filter over 16 rows: transposes the 16x16 block at (src - 8) into a scratch buffer, runs the two filter stages on it, and transposes back only when neither stage returned non-zero. */ +void aom_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint8_t early_exit = 0; + DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); + uint8_t *filter48 = &transposed_input[16 * 16]; /* last 16 * 8 bytes of the scratch buffer */ + + transpose_16x16((src - 8), pitch, &transposed_input[0], 16); + /* first stage; NOTE(review): a non-zero return presumably means it already wrote its output through src - confirm against aom_vt_lpf_t4_and_t8_16w */ + early_exit = + aom_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); + + if (0 == early_exit) { + early_exit = aom_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, + &filter48[0]); + /* zero return: the result is still in the transposed buffer, so copy it back */ + if (0 == early_exit) { + transpose_16x16(transposed_input, 16, (src - 8), pitch); + } + } +} diff --git a/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c new file mode 100644 index 000000000..dc0a97764 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/loopfilter_msa.h" + /* 4-tap loop filter across a horizontal edge at src: loads the 8 rows starting at src - 4 * pitch and writes 8 bytes of each of the 4 filtered rows (p1, p0, q0, q1) back starting at src - 2 * pitch. */ +void aom_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint64_t p1_d, p0_d, q0_d, q1_d; + v16u8 mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + /* replicate each scalar parameter into every byte lane */ + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + /* only doubleword 0 (8 bytes) of each filtered row is written back */ + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); +} + /* 4-tap horizontal-edge filter driven by two parameter sets: set 0 and set 1 are merged into single vectors (one doubleword each) and four full vectors are stored starting at src - 2 * pitch. */ +void aom_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + /* fill one vector per parameter set, then merge set 0 and set 1 with ilvr_d */ + thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); + thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); + thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); + + b_limit0 = 
(v16u8)__msa_fill_b(*b_limit0_ptr); + b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); + b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); + + limit0 = (v16u8)__msa_fill_b(*limit0_ptr); + limit1 = (v16u8)__msa_fill_b(*limit1_ptr); + limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + /* p1..q1 were filtered in place; store them as four rows */ + ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); +} + /* 4-tap loop filter across a vertical edge at src: loads 8 rows starting at src - 4, transposes them so p3..q3 become vectors, filters, and writes the result back starting at src - 2. */ +void aom_lpf_vertical_4_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 mask, hev, flat, limit, thresh, b_limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v8i16 vec0, vec1, vec2, vec3; + + LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + /* the transpose writes its outputs over the same p3..q3 variables */ + TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + /* write back starting 2 bytes left of the edge; the two stores are 4 rows apart */ + src -= 2; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + src += 4 * pitch; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); +} + /* 16-row vertical-edge 4-tap filter driven by two parameter sets: loads 16 rows at src - 4, transposes them into p3..q3, filters, and stores starting at src - 2 with two stores 8 rows apart. */ +void aom_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + v16u8 mask, hev, flat; + v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + 
LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); + LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13, + row14, row15); + + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + /* fill one vector per parameter set, then merge set 0 and set 1 with ilvr_d */ + thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); + thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); + thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); + + b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); + b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); + b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); + + limit0 = (v16u8)__msa_fill_b(*limit0_ptr); + limit1 = (v16u8)__msa_fill_b(*limit1_ptr); + limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3); + ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5); + + src -= 2; + /* two stores, 8 rows apart */ + ST4x8_UB(tmp2, tmp3, src, pitch); + src += (8 * pitch); + ST4x8_UB(tmp4, tmp5, src, pitch); +} diff --git a/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c new file mode 100644 index 000000000..dc203e79c --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/loopfilter_msa.h" + /* Horizontal-edge filter with a filter8 path: the 4-tap result is always computed; when the flat mask is not all zero the AOM_FILTER8 result is merged in and 6 rows are stored instead of 4. Only doubleword 0 (8 bytes) per row is stored. */ +void aom_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; + v16u8 mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8; + v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; + v16i8 zero = { 0 }; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + /* keep one doubleword of flat; the other one is taken from zero */ + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + /* flat all zero: the 4-tap rows p1..q1 are the final output */ + if (__msa_test_bz_v(flat)) { + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, + q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); + PCKEV_B2_SH(zero, 
q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat); + + p2_d = __msa_copy_u_d((v2i64)p2_out, 0); + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + q2_d = __msa_copy_u_d((v2i64)q2_out, 0); + /* six rows p2..q2 are written, starting 3 rows above src */ + src -= 3 * pitch; + + SD4(p2_d, p1_d, p0_d, q0_d, src, pitch); + src += (4 * pitch); + SD(q1_d, src); + src += pitch; + SD(q2_d, src); + } +} + /* Horizontal-edge filter with a filter8 path driven by two parameter sets merged into single vectors; stores 4 full vectors when flat is all zero, otherwise 6. */ +void aom_lpf_horizontal_8_dual_msa( + uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1, + const uint8_t *thresh1) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, tmp, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + + /* load vector elements */ + LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + /* tmp holds the set 1 value while it is merged with set 0 */ + thresh = (v16u8)__msa_fill_b(*thresh0); + tmp = (v16u8)__msa_fill_b(*thresh1); + thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh); + + b_limit = (v16u8)__msa_fill_b(*b_limit0); + tmp = (v16u8)__msa_fill_b(*b_limit1); + b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit); + + limit = (v16u8)__msa_fill_b(*limit0); + tmp = (v16u8)__msa_fill_b(*limit1); + limit = 
(v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + /* flat all zero: the 4-tap rows p1..q1 are the final output */ + if (__msa_test_bz_v(flat)) { + ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + /* same filter again on the _l halves */ + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + src -= 3 * pitch; + + ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch); + src += (4 * pitch); + ST_UB2(q1_out, q2_out, src, pitch); + src += (2 * pitch); /* no effect: src is not read again */ + } +} + /* Vertical-edge filter with a filter8 path for 8 rows: loads 8 rows at src - 4, transposes, and stores either 4 bytes per row (flat all zero) or 6 bytes per row. */ +void aom_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + 
v16u8 p1_out, p0_out, q0_out, q1_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v16u8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4; + + /* load vector elements */ + LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3); + /* the transpose writes its outputs over the same p3..q3 variables */ + TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + /* keep one doubleword of flat; the other one is taken from zero */ + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + /* Store 4 pixels p1-_q1 */ + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + + src -= 2; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + src += 4 * pitch; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r, + p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0 = 
__msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + /* Store 6 pixels p2-_q2 */ + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1); + /* each row gets a 4-byte store at src - 3 and a 2-byte store right after it */ + src -= 3; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec4, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec4, 4, src + 4, pitch); + } +} + /* 16-row vertical-edge filter with a filter8 path driven by two parameter sets: loads 16 rows at src - 4, transposes them into p3..q3, and stores 4 or 6 bytes per row depending on the flat mask. */ +void aom_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0, const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *b_limit1, const uint8_t *limit1, + const uint8_t *thresh1) { + uint8_t *temp_src; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p1_out, p0_out, q0_out, q1_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v16u8 row4, row5, row6, row7, row12, row13, row14, row15; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + temp_src = src - 4; + /* p0..p3 and q3..q0 hold raw rows 0-3 and 8-11 here; the transpose below overwrites them */ + LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7); + temp_src += (8 * pitch); + LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15); + + /* transpose 16x8 matrix into 8x16 */ + TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0, + row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, + q3); + /* vec0 is borrowed as a temporary for the set 1 values */ + thresh = (v16u8)__msa_fill_b(*thresh0); + vec0 = (v8i16)__msa_fill_b(*thresh1); + thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh); + + b_limit = (v16u8)__msa_fill_b(*b_limit0); + vec0 = (v8i16)__msa_fill_b(*b_limit1); + b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit); + + limit = 
(v16u8)__msa_fill_b(*limit0); + vec0 = (v8i16)__msa_fill_b(*limit1); + limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + /* flat all zero: store only the 4-tap columns p1..q1 */ + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec4, vec5); + + src -= 2; + ST4x8_UB(vec2, vec3, src, pitch); + src += 8 * pitch; + ST4x8_UB(vec4, vec5, src, pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + + /* filter8 */ + AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ILVR_B2_SH(p1, 
p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec6, vec7); + ILVRL_B2_SH(q2, q1, vec2, vec5); + /* four groups of 4 rows; each row gets a 4-byte store at src - 3 and a 2-byte store right after it */ + src -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 4, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 4, src + 4, pitch); + } +} diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c new file mode 100644 index 000000000..883d0523d --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <stdlib.h> + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" +#include "aom_dsp/mips/loopfilter_filters_dspr2.h" +#include "aom_dsp/mips/loopfilter_macros_dspr2.h" +#include "aom_dsp/mips/loopfilter_masks_dspr2.h" +#include "aom_mem/aom_mem.h" + /* aom_lpf_horizontal_4_dspr2: 4-tap filter across a horizontal edge at s; covers 8 columns as two passes of one 32-bit word (4 pixels) per row. */ +#if HAVE_DSPR2 +void aom_lpf_horizontal_4_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint8_t i; + uint32_t mask; + uint32_t hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s); + + /* loop filter designed to work using chars so that we can make maximum use + of 8 bit simd instructions. 
*/ + for (i = 0; i < 2; i++) { + sm1 = s - (pitch << 2); + s0 = sm1 + pitch; + s1 = s0 + pitch; + s2 = s - pitch; + s3 = s; + s4 = s + pitch; + s5 = s4 + pitch; + s6 = s5 + pitch; + /* sm1..s2 are the 4 rows above s, s3..s6 the 4 rows from s downwards; load the inner 4 rows first */ + __asm__ __volatile__( + "lw %[p1], (%[s1]) \n\t" + "lw %[p2], (%[s2]) \n\t" + "lw %[p3], (%[s3]) \n\t" + "lw %[p4], (%[s4]) \n\t" + + : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + mask will be zero and filtering is not needed */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + __asm__ __volatile__( + "lw %[pm1], (%[sm1]) \n\t" + "lw %[p0], (%[s0]) \n\t" + "lw %[p5], (%[s5]) \n\t" + "lw %[p6], (%[s6]) \n\t" + + : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6) + : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6)); + + filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5, + p6, thresh_vec, &hev, &mask); + + /* if mask == 0 filtering is not needed */ + if (mask) { + /* filtering */ + filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); + + __asm__ __volatile__( + "sw %[p1], (%[s1]) \n\t" + "sw %[p2], (%[s2]) \n\t" + "sw %[p3], (%[s3]) \n\t" + "sw %[p4], (%[s4]) \n\t" + + : + : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4), + [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + } + } + /* advance to the next 4 columns */ + s = s + 4; + } +} + /* 4-tap filter across a vertical edge at s: two passes of 4 rows each; every row contributes the word at s - 4 and the word at s, which are transposed before filtering. */ +void aom_lpf_vertical_4_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], 
%[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p2 = *((uint32_t *)(s1 - 4)); + p6 = *((uint32_t *)(s1)); + p1 = *((uint32_t *)(s2 - 4)); + p5 = *((uint32_t *)(s2)); + p0 = *((uint32_t *)(s3 - 4)); + p4 = *((uint32_t *)(s3)); + pm1 = *((uint32_t *)(s4 - 4)); + p3 = *((uint32_t *)(s4)); + + /* transpose pm1, p0, p1, p2 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" + "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[pm1], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), + [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose p3, p4, p5, p6 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" + 
"append %[p5], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5, + p6, thresh_vec, &hev, &mask); + + /* if mask == 0 filtering is not needed */ + if (mask) { + /* filtering */ + filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); + + /* unpack processed 4x4 neighborhood + * don't use transpose on output data + * because memory isn't aligned + */ + __asm__ __volatile__( + "sb %[p4], 1(%[s4]) \n\t" + "sb %[p3], 0(%[s4]) \n\t" + "sb %[p2], -1(%[s4]) \n\t" + "sb %[p1], -2(%[s4]) \n\t" + + : + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [s4] "r"(s4)); + /* shift the next byte of each word into place for the row above (s3) */ + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s3]) \n\t" + "sb %[p3], 0(%[s3]) \n\t" + "sb %[p2], -1(%[s3]) \n\t" + "sb %[p1], -2(%[s3]) \n\t" + + : [p1] "+r"(p1) + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s2]) \n\t" + "sb %[p3], 0(%[s2]) \n\t" + "sb %[p2], -1(%[s2]) \n\t" + "sb %[p1], -2(%[s2]) \n\t" + + : + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [s2] "r"(s2)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 
8 \n\t" + + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s1]) \n\t" + "sb %[p3], 0(%[s1]) \n\t" + "sb %[p2], -1(%[s1]) \n\t" + "sb %[p1], -2(%[s1]) \n\t" + + : + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [s1] "r"(s1)); + } + } + } +} + /* Runs aom_lpf_horizontal_4_dspr2 twice: parameter set 0 at s, parameter set 1 at s + 8. */ +void aom_lpf_horizontal_4_dual_dspr2( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { + aom_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1); +} + /* Runs aom_lpf_horizontal_8_dspr2 (defined outside this file) twice: parameter set 0 at s, parameter set 1 at s + 8. */ +void aom_lpf_horizontal_8_dual_dspr2( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { + aom_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1); +} + /* Runs aom_lpf_vertical_4_dspr2 twice: parameter set 0 at s, parameter set 1 eight rows down (s + 8 * p). */ +void aom_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); +} + /* Runs aom_lpf_vertical_8_dspr2 (defined outside this file) twice: parameter set 0 at s, parameter set 1 eight rows down (s + 8 * p). */ +void aom_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); +} + /* Runs aom_lpf_vertical_16_dspr2 (defined outside this file) at s and again eight rows down, with the same parameters both times. */ +void aom_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + aom_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh); + aom_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh); +} +#endif // #if HAVE_DSPR2 +diff 
--git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h new file mode 100644 index 000000000..72df09823 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h @@ -0,0 +1,735 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ +#define AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +/* inputs & outputs are quad-byte vectors */ +static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1, + uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) { + int32_t aom_filter_l, aom_filter_r; + int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; + int32_t subr_r, subr_l; + uint32_t t1, t2, HWM, t3; + uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; + int32_t vps1, vps0, vqs0, vqs1; + int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; + uint32_t N128; + + N128 = 0x80808080; + t1 = 0x03000300; + t2 = 0x04000400; + t3 = 0x01000100; + HWM = 0xFF00FF00; + + vps0 = (*ps0) ^ N128; + vps1 = (*ps1) ^ N128; + vqs0 = (*qs0) ^ N128; + vqs1 = (*qs1) ^ N128; + + /* use halfword pairs instead quad-bytes because of accuracy */ + vps0_l = vps0 & HWM; + vps0_r = vps0 << 8; + vps0_r = vps0_r & HWM; + + vps1_l = vps1 & HWM; + vps1_r = vps1 << 8; + vps1_r = 
vps1_r & HWM; + + vqs0_l = vqs0 & HWM; + vqs0_r = vqs0 << 8; + vqs0_r = vqs0_r & HWM; + + vqs1_l = vqs1 & HWM; + vqs1_r = vqs1 << 8; + vqs1_r = vqs1_r & HWM; + + mask_l = mask & HWM; + mask_r = mask << 8; + mask_r = mask_r & HWM; + + hev_l = hev & HWM; + hev_r = hev << 8; + hev_r = hev_r & HWM; + + __asm__ __volatile__( + /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */ + "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t" + "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t" + + /* qs0 - ps0 */ + "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" + "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" + + /* aom_filter &= hev; */ + "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t" + "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t" + + /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */ + "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" + "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" + "xor %[invhev_l], %[hev_l], %[HWM] \n\t" + "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" + "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" + "xor %[invhev_r], %[hev_r], %[HWM] \n\t" + "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" + "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" + + /* aom_filter &= mask; */ + "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t" + "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t" + + : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r), + [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r), + [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r) + : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l), + [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), + [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l), + [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r), + [HWM] "r"(HWM)); + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + __asm__ 
__volatile__( + /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */ + "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t" + "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t" + + /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */ + "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t" + "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t" + "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" + "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" + + "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" + "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" + + "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" + "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" + + /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */ + "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" + "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" + + /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */ + "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" + + : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), + [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), + [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), + [vqs0_r] "+r"(vqs0_r) + : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), + [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r)); + + __asm__ __volatile__( + /* (aom_filter += 1) >>= 1 */ + "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" + "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" + + /* aom_filter &= ~hev; */ + "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" + "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" + + /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */ + "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t" + "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" + + /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */ + "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" + + : [Filter1_l] "+r"(Filter1_l), [Filter1_r] 
"+r"(Filter1_r), + [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), + [vqs1_r] "+r"(vqs1_r) + : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); + + /* Create quad-bytes from halfword pairs */ + vqs0_l = vqs0_l & HWM; + vqs1_l = vqs1_l & HWM; + vps0_l = vps0_l & HWM; + vps1_l = vps1_l & HWM; + + __asm__ __volatile__( + "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" + "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" + "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" + "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" + + : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), + [vqs0_r] "+r"(vqs0_r) + :); + + vqs0 = vqs0_l | vqs0_r; + vqs1 = vqs1_l | vqs1_r; + vps0 = vps0_l | vps0_r; + vps1 = vps1_l | vps1_r; + + *ps0 = vps0 ^ N128; + *ps1 = vps1 ^ N128; + *qs0 = vqs0 ^ N128; + *qs1 = vqs1 ^ N128; +} + +static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1, + uint32_t ps0, uint32_t qs0, uint32_t qs1, + uint32_t *p1_f0, uint32_t *p0_f0, + uint32_t *q0_f0, uint32_t *q1_f0) { + int32_t aom_filter_l, aom_filter_r; + int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; + int32_t subr_r, subr_l; + uint32_t t1, t2, HWM, t3; + uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; + int32_t vps1, vps0, vqs0, vqs1; + int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; + uint32_t N128; + + N128 = 0x80808080; + t1 = 0x03000300; + t2 = 0x04000400; + t3 = 0x01000100; + HWM = 0xFF00FF00; + + vps0 = (ps0) ^ N128; + vps1 = (ps1) ^ N128; + vqs0 = (qs0) ^ N128; + vqs1 = (qs1) ^ N128; + + /* use halfword pairs instead quad-bytes because of accuracy */ + vps0_l = vps0 & HWM; + vps0_r = vps0 << 8; + vps0_r = vps0_r & HWM; + + vps1_l = vps1 & HWM; + vps1_r = vps1 << 8; + vps1_r = vps1_r & HWM; + + vqs0_l = vqs0 & HWM; + vqs0_r = vqs0 << 8; + vqs0_r = vqs0_r & HWM; + + vqs1_l = vqs1 & HWM; + vqs1_r = vqs1 << 8; + vqs1_r = vqs1_r & HWM; + + mask_l = mask & HWM; + mask_r = mask << 8; + mask_r = mask_r & HWM; + + hev_l = hev & HWM; + 
hev_r = hev << 8; + hev_r = hev_r & HWM; + + __asm__ __volatile__( + /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */ + "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t" + "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t" + + /* qs0 - ps0 */ + "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" + "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" + + /* aom_filter &= hev; */ + "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t" + "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t" + + /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */ + "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" + "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" + "xor %[invhev_l], %[hev_l], %[HWM] \n\t" + "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" + "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" + "xor %[invhev_r], %[hev_r], %[HWM] \n\t" + "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" + "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" + + /* aom_filter &= mask; */ + "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t" + "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t" + + : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r), + [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r), + [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r) + : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l), + [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), + [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l), + [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r), + [HWM] "r"(HWM)); + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + __asm__ __volatile__( + /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */ + "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t" + "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t" + + /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */ + 
"addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t" + "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t" + "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" + "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" + + "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" + "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" + + "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" + "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" + + /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */ + "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" + "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" + + /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */ + "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" + + : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), + [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), + [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), + [vqs0_r] "+r"(vqs0_r) + : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), + [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r)); + + __asm__ __volatile__( + /* (aom_filter += 1) >>= 1 */ + "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" + "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" + + /* aom_filter &= ~hev; */ + "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" + "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" + + /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */ + "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t" + "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" + + /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */ + "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" + + : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), + [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), + [vqs1_r] "+r"(vqs1_r) + : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); + + /* Create quad-bytes from halfword pairs */ + vqs0_l = vqs0_l & HWM; 
+ vqs1_l = vqs1_l & HWM; + vps0_l = vps0_l & HWM; + vps1_l = vps1_l & HWM; + + __asm__ __volatile__( + "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" + "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" + "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" + "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" + + : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), + [vqs0_r] "+r"(vqs0_r) + :); + + vqs0 = vqs0_l | vqs0_r; + vqs1 = vqs1_l | vqs1_r; + vps0 = vps0_l | vps0_r; + vps1 = vps1_l | vps1_r; + + *p0_f0 = vps0 ^ N128; + *p1_f0 = vps1 ^ N128; + *q0_f0 = vqs0 ^ N128; + *q1_f0 = vqs1 ^ N128; +} + +static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1, + uint32_t *op0, uint32_t *oq0, uint32_t *oq1, + uint32_t *oq2, uint32_t *oq3) { + /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ + const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; + + /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ + /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ + /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ + /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ + /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ + /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ + + __asm__ __volatile__( + "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" + + "shll.ph %[tmp], %[p3], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p3] \n\t" 
+ "addu.ph %[res_op1], %[p3], %[p3] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" + "shrl.ph %[res_op1], %[res_op1], 3 \n\t" + "shrl.ph %[res_op2], %[res_op2], 3 \n\t" + "addu.ph %[res_op0], %[p3], %[p0] \n\t" + "addu.ph %[res_oq0], %[q0], %[q3] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq1], %[q3], %[q3] \n\t" + "shll.ph %[tmp], %[q3], 1 \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t" + "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" + "shrl.ph %[res_op0], %[res_op0], 3 \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" + + : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), + [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2), + [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); + + *op2 = res_op2; + *op1 = res_op1; + *op0 = res_op0; + *oq0 = res_oq0; + *oq1 = res_oq1; + *oq2 = res_oq2; +} + +static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1, + uint32_t p0, uint32_t q0, uint32_t q1, + uint32_t q2, uint32_t q3, uint32_t *op2_f1, + uint32_t *op1_f1, uint32_t 
*op0_f1, + uint32_t *oq0_f1, uint32_t *oq1_f1, + uint32_t *oq2_f1) { + /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; + + /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ + /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ + /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ + /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ + /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ + /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ + + __asm__ __volatile__( + "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" + + "shll.ph %[tmp], %[p3], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p3] \n\t" + "addu.ph %[res_op1], %[p3], %[p3] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" + "shrl.ph %[res_op1], %[res_op1], 3 \n\t" + "shrl.ph %[res_op2], %[res_op2], 3 \n\t" + "addu.ph %[res_op0], %[p3], %[p0] \n\t" + "addu.ph %[res_oq0], %[q0], %[q3] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq1], %[q3], %[q3] \n\t" + "shll.ph %[tmp], %[q3], 1 \n\t" + "addu.ph %[res_oq1], %[res_oq1], 
%[q1] \n\t" + "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" + "shrl.ph %[res_op0], %[res_op0], 3 \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" + + : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), + [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2), + [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); + + *op2_f1 = res_op2; + *op1_f1 = res_op1; + *op0_f1 = res_op0; + *oq0_f1 = res_oq0; + *oq1_f1 = res_oq1; + *oq2_f1 = res_oq2; +} + +static INLINE void wide_mbfilter_dspr2( + uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3, + uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1, + uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6, + uint32_t *oq7) { + const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4; + const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; + uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6; + uint32_t tmp; + uint32_t add_p6toq6; + uint32_t u32Eight = 0x00080008; + + __asm__ __volatile__( + /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6 + which is used most of the time */ + "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p4] 
\n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p3] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p2] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p1] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p0] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q0] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q1] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q2] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q3] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q4] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q5] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t" + + : [add_p6toq6] "=&r"(add_p6toq6) + : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), + [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), + [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), + [u32Eight] "r"(u32Eight)); + + __asm__ __volatile__( + /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + + p3 + p2 + p1 + p0 + q0, 4) */ + "shll.ph %[tmp], %[p7], 3 \n\t" + "subu.ph %[res_op6], %[tmp], %[p7] \n\t" + "addu.ph %[res_op6], %[res_op6], %[p6] \n\t" + "addu.ph %[res_op6], %[res_op6], %[add_p6toq6] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q1] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q2] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q3] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q4] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q5] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q6] \n\t" + "shrl.ph %[res_op6], %[res_op6], 4 \n\t" + + /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + + p2 + p1 + p0 + q0 + q1, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op5], %[tmp], %[p7] \n\t" + "addu.ph %[res_op5], %[res_op5], %[p7] \n\t" + "addu.ph %[res_op5], %[res_op5], %[p5] \n\t" + "addu.ph %[res_op5], %[res_op5], %[add_p6toq6] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q2] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q3] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q4] 
\n\t" + "subu.ph %[res_op5], %[res_op5], %[q5] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q6] \n\t" + "shrl.ph %[res_op5], %[res_op5], 4 \n\t" + + /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + + p1 + p0 + q0 + q1 + q2, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op4], %[tmp], %[p7] \n\t" + "addu.ph %[res_op4], %[res_op4], %[p4] \n\t" + "addu.ph %[res_op4], %[res_op4], %[add_p6toq6] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q3] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q4] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q5] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q6] \n\t" + "shrl.ph %[res_op4], %[res_op4], 4 \n\t" + + /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + + p1 + p0 + q0 + q1 + q2 + q3, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op3], %[tmp], %[p3] \n\t" + "addu.ph %[res_op3], %[res_op3], %[add_p6toq6] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q4] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q5] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q6] \n\t" + "shrl.ph %[res_op3], %[res_op3], 4 \n\t" + + /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + + p0 + q0 + q1 + q2 + q3 + q4, 4) */ + "shll.ph %[tmp], %[p7], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p7] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p6toq6] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q5] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q6] \n\t" + "shrl.ph %[res_op2], %[res_op2], 4 \n\t" + + /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + + p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */ + "shll.ph %[tmp], %[p7], 1 \n\t" + "addu.ph %[res_op1], %[tmp], %[p1] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p6toq6] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q6] \n\t" + "shrl.ph %[res_op1], %[res_op1], 4 \n\t" + + /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + + q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */ + "addu.ph %[res_op0], %[p7], 
%[p0] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t" + "shrl.ph %[res_op0], %[res_op0], 4 \n\t" + + : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5), + [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp) + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1), + [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), + [add_p6toq6] "r"(add_p6toq6)); + + *op6 = res_op6; + *op5 = res_op5; + *op4 = res_op4; + *op3 = res_op3; + *op2 = res_op2; + *op1 = res_op1; + *op0 = res_op0; + + __asm__ __volatile__( + /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */ + "addu.ph %[res_oq0], %[q7], %[q0] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p6toq6] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 4 \n\t" + + /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */ + "shll.ph %[tmp], %[q7], 1 \n\t" + "addu.ph %[res_oq1], %[tmp], %[q1] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p6toq6] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p6] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 4 \n\t" + + /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + + q3 + q4 + q5 + q6 + q7 * 3, 4) */ + "shll.ph %[tmp], %[q7], 1 \n\t" + "addu.ph %[res_oq2], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p6toq6] \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p5] \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p6] \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 4 \n\t" + + /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 + + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq3], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq3], %[res_oq3], %[add_p6toq6] \n\t" + "subu.ph 
%[res_oq3], %[res_oq3], %[p4] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p5] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p6] \n\t" + "shrl.ph %[res_oq3], %[res_oq3], 4 \n\t" + + /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 + + q4 * 2 + q5 + q6 + q7 * 5, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq4], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq4], %[res_oq4], %[q4] \n\t" + "addu.ph %[res_oq4], %[res_oq4], %[add_p6toq6] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p3] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p4] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p5] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p6] \n\t" + "shrl.ph %[res_oq4], %[res_oq4], 4 \n\t" + + /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 + + q5 * 2 + q6 + q7 * 6, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq5], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[q7] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[q5] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[add_p6toq6] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p2] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p3] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p4] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p5] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p6] \n\t" + "shrl.ph %[res_oq5], %[res_oq5], 4 \n\t" + + /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + + q4 + q5 + q6 * 2 + q7 * 7, 4) */ + "shll.ph %[tmp], %[q7], 3 \n\t" + "subu.ph %[res_oq6], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq6], %[res_oq6], %[q6] \n\t" + "addu.ph %[res_oq6], %[res_oq6], %[add_p6toq6] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p1] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p2] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p3] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p4] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p5] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t" + "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t" + + : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5), + [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3), + 
[res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1), + [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp) + : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3), + [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2), + [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6), + [add_p6toq6] "r"(add_p6toq6)); + + *oq0 = res_oq0; + *oq1 = res_oq1; + *oq2 = res_oq2; + *oq3 = res_oq3; + *oq4 = res_oq4; + *oq5 = res_oq5; + *oq6 = res_oq6; +} +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h new file mode 100644 index 000000000..3e6994714 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ +#define AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_mem/aom_mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +#define STORE_F0() \ + { \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s4]) \n\t" \ + "sb %[q0_f0], 0(%[s4]) \n\t" \ + "sb %[p0_f0], -1(%[s4]) \n\t" \ + "sb %[p1_f0], -2(%[s4]) \n\t" \ + \ + : \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ + [p1_f0] "r"(p1_f0), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ + [p1_f0] "+r"(p1_f0) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s3]) \n\t" \ + "sb %[q0_f0], 0(%[s3]) \n\t" \ + "sb %[p0_f0], -1(%[s3]) \n\t" \ + "sb %[p1_f0], -2(%[s3]) \n\t" \ + \ + : [p1_f0] "+r"(p1_f0) \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [s3] "r"(s3), \ + [p0_f0] "r"(p0_f0)); \ + \ + __asm__ __volatile__( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ + [p1_f0] "+r"(p1_f0) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s2]) \n\t" \ + "sb %[q0_f0], 0(%[s2]) \n\t" \ + "sb %[p0_f0], -1(%[s2]) \n\t" \ + "sb %[p1_f0], -2(%[s2]) \n\t" \ + \ + : \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ + [p1_f0] "r"(p1_f0), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ + [p1_f0] "+r"(p1_f0) \ + :); \ + \ + __asm__ 
__volatile__( \ + "sb %[q1_f0], 1(%[s1]) \n\t" \ + "sb %[q0_f0], 0(%[s1]) \n\t" \ + "sb %[p0_f0], -1(%[s1]) \n\t" \ + "sb %[p1_f0], -2(%[s1]) \n\t" \ + \ + : \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ + [p1_f0] "r"(p1_f0), [s1] "r"(s1)); \ + } + +#define STORE_F1() \ + { \ + __asm__ __volatile__( \ + "sb %[q2_r], 2(%[s4]) \n\t" \ + "sb %[q1_r], 1(%[s4]) \n\t" \ + "sb %[q0_r], 0(%[s4]) \n\t" \ + "sb %[p0_r], -1(%[s4]) \n\t" \ + "sb %[p1_r], -2(%[s4]) \n\t" \ + "sb %[p2_r], -3(%[s4]) \n\t" \ + \ + : \ + : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \ + [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q2_r], %[q2_r], 16 \n\t" \ + "srl %[q1_r], %[q1_r], 16 \n\t" \ + "srl %[q0_r], %[q0_r], 16 \n\t" \ + "srl %[p0_r], %[p0_r], 16 \n\t" \ + "srl %[p1_r], %[p1_r], 16 \n\t" \ + "srl %[p2_r], %[p2_r], 16 \n\t" \ + \ + : [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), [q0_r] "+r"(q0_r), \ + [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), [p2_r] "+r"(p2_r) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q2_r], 2(%[s3]) \n\t" \ + "sb %[q1_r], 1(%[s3]) \n\t" \ + "sb %[q0_r], 0(%[s3]) \n\t" \ + "sb %[p0_r], -1(%[s3]) \n\t" \ + "sb %[p1_r], -2(%[s3]) \n\t" \ + "sb %[p2_r], -3(%[s3]) \n\t" \ + \ + : \ + : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \ + [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s3] "r"(s3)); \ + \ + __asm__ __volatile__( \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + \ + : \ + : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ + [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ + "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl 
%[p2_l], %[p2_l], 16 \n\t" \ + \ + : [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), [q0_l] "+r"(q0_l), \ + [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), [p2_l] "+r"(p2_l) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + \ + : \ + : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ + [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s1] "r"(s1)); \ + } + +#define STORE_F2() \ + { \ + __asm__ __volatile__( \ + "sb %[q6_r], 6(%[s4]) \n\t" \ + "sb %[q5_r], 5(%[s4]) \n\t" \ + "sb %[q4_r], 4(%[s4]) \n\t" \ + "sb %[q3_r], 3(%[s4]) \n\t" \ + "sb %[q2_r], 2(%[s4]) \n\t" \ + "sb %[q1_r], 1(%[s4]) \n\t" \ + "sb %[q0_r], 0(%[s4]) \n\t" \ + "sb %[p0_r], -1(%[s4]) \n\t" \ + "sb %[p1_r], -2(%[s4]) \n\t" \ + "sb %[p2_r], -3(%[s4]) \n\t" \ + "sb %[p3_r], -4(%[s4]) \n\t" \ + "sb %[p4_r], -5(%[s4]) \n\t" \ + "sb %[p5_r], -6(%[s4]) \n\t" \ + "sb %[p6_r], -7(%[s4]) \n\t" \ + \ + : \ + : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ + [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \ + [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ + [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ + [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q6_r], %[q6_r], 16 \n\t" \ + "srl %[q5_r], %[q5_r], 16 \n\t" \ + "srl %[q4_r], %[q4_r], 16 \n\t" \ + "srl %[q3_r], %[q3_r], 16 \n\t" \ + "srl %[q2_r], %[q2_r], 16 \n\t" \ + "srl %[q1_r], %[q1_r], 16 \n\t" \ + "srl %[q0_r], %[q0_r], 16 \n\t" \ + "srl %[p0_r], %[p0_r], 16 \n\t" \ + "srl %[p1_r], %[p1_r], 16 \n\t" \ + "srl %[p2_r], %[p2_r], 16 \n\t" \ + "srl %[p3_r], %[p3_r], 16 \n\t" \ + "srl %[p4_r], %[p4_r], 16 \n\t" \ + "srl %[p5_r], %[p5_r], 16 \n\t" \ + "srl %[p6_r], %[p6_r], 16 \n\t" \ + \ + : [q6_r] "+r"(q6_r), [q5_r] "+r"(q5_r), [q4_r] "+r"(q4_r), \ + [q3_r] "+r"(q3_r), [q2_r] "+r"(q2_r), [q1_r] 
"+r"(q1_r), \ + [q0_r] "+r"(q0_r), [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), \ + [p2_r] "+r"(p2_r), [p3_r] "+r"(p3_r), [p4_r] "+r"(p4_r), \ + [p5_r] "+r"(p5_r), [p6_r] "+r"(p6_r) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q6_r], 6(%[s3]) \n\t" \ + "sb %[q5_r], 5(%[s3]) \n\t" \ + "sb %[q4_r], 4(%[s3]) \n\t" \ + "sb %[q3_r], 3(%[s3]) \n\t" \ + "sb %[q2_r], 2(%[s3]) \n\t" \ + "sb %[q1_r], 1(%[s3]) \n\t" \ + "sb %[q0_r], 0(%[s3]) \n\t" \ + "sb %[p0_r], -1(%[s3]) \n\t" \ + "sb %[p1_r], -2(%[s3]) \n\t" \ + "sb %[p2_r], -3(%[s3]) \n\t" \ + "sb %[p3_r], -4(%[s3]) \n\t" \ + "sb %[p4_r], -5(%[s3]) \n\t" \ + "sb %[p5_r], -6(%[s3]) \n\t" \ + "sb %[p6_r], -7(%[s3]) \n\t" \ + \ + : \ + : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ + [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \ + [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ + [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ + [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s3] "r"(s3)); \ + \ + __asm__ __volatile__( \ + "sb %[q6_l], 6(%[s2]) \n\t" \ + "sb %[q5_l], 5(%[s2]) \n\t" \ + "sb %[q4_l], 4(%[s2]) \n\t" \ + "sb %[q3_l], 3(%[s2]) \n\t" \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + "sb %[p3_l], -4(%[s2]) \n\t" \ + "sb %[p4_l], -5(%[s2]) \n\t" \ + "sb %[p5_l], -6(%[s2]) \n\t" \ + "sb %[p6_l], -7(%[s2]) \n\t" \ + \ + : \ + : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ + [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ + [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ + [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ + [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q6_l], %[q6_l], 16 \n\t" \ + "srl %[q5_l], %[q5_l], 16 \n\t" \ + "srl %[q4_l], %[q4_l], 16 \n\t" \ + "srl %[q3_l], %[q3_l], 16 \n\t" \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ 
+ "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl %[p2_l], %[p2_l], 16 \n\t" \ + "srl %[p3_l], %[p3_l], 16 \n\t" \ + "srl %[p4_l], %[p4_l], 16 \n\t" \ + "srl %[p5_l], %[p5_l], 16 \n\t" \ + "srl %[p6_l], %[p6_l], 16 \n\t" \ + \ + : [q6_l] "+r"(q6_l), [q5_l] "+r"(q5_l), [q4_l] "+r"(q4_l), \ + [q3_l] "+r"(q3_l), [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), \ + [q0_l] "+r"(q0_l), [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), \ + [p2_l] "+r"(p2_l), [p3_l] "+r"(p3_l), [p4_l] "+r"(p4_l), \ + [p5_l] "+r"(p5_l), [p6_l] "+r"(p6_l) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q6_l], 6(%[s1]) \n\t" \ + "sb %[q5_l], 5(%[s1]) \n\t" \ + "sb %[q4_l], 4(%[s1]) \n\t" \ + "sb %[q3_l], 3(%[s1]) \n\t" \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + "sb %[p3_l], -4(%[s1]) \n\t" \ + "sb %[p4_l], -5(%[s1]) \n\t" \ + "sb %[p5_l], -6(%[s1]) \n\t" \ + "sb %[p6_l], -7(%[s1]) \n\t" \ + \ + : \ + : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ + [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ + [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ + [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ + [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s1] "r"(s1)); \ + } + +#define PACK_LEFT_0TO3() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbl %[p3_l], %[p3] \n\t" \ + "preceu.ph.qbl %[p2_l], %[p2] \n\t" \ + "preceu.ph.qbl %[p1_l], %[p1] \n\t" \ + "preceu.ph.qbl %[p0_l], %[p0] \n\t" \ + "preceu.ph.qbl %[q0_l], %[q0] \n\t" \ + "preceu.ph.qbl %[q1_l], %[q1] \n\t" \ + "preceu.ph.qbl %[q2_l], %[q2] \n\t" \ + "preceu.ph.qbl %[q3_l], %[q3] \n\t" \ + \ + : [p3_l] "=&r"(p3_l), [p2_l] "=&r"(p2_l), [p1_l] "=&r"(p1_l), \ + [p0_l] "=&r"(p0_l), [q0_l] "=&r"(q0_l), [q1_l] "=&r"(q1_l), \ + [q2_l] "=&r"(q2_l), [q3_l] "=&r"(q3_l) \ + : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \ + 
[q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ + } + +#define PACK_LEFT_4TO7() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbl %[p7_l], %[p7] \n\t" \ + "preceu.ph.qbl %[p6_l], %[p6] \n\t" \ + "preceu.ph.qbl %[p5_l], %[p5] \n\t" \ + "preceu.ph.qbl %[p4_l], %[p4] \n\t" \ + "preceu.ph.qbl %[q4_l], %[q4] \n\t" \ + "preceu.ph.qbl %[q5_l], %[q5] \n\t" \ + "preceu.ph.qbl %[q6_l], %[q6] \n\t" \ + "preceu.ph.qbl %[q7_l], %[q7] \n\t" \ + \ + : [p7_l] "=&r"(p7_l), [p6_l] "=&r"(p6_l), [p5_l] "=&r"(p5_l), \ + [p4_l] "=&r"(p4_l), [q4_l] "=&r"(q4_l), [q5_l] "=&r"(q5_l), \ + [q6_l] "=&r"(q6_l), [q7_l] "=&r"(q7_l) \ + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ + [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ + } + +#define PACK_RIGHT_0TO3() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbr %[p3_r], %[p3] \n\t" \ + "preceu.ph.qbr %[p2_r], %[p2] \n\t" \ + "preceu.ph.qbr %[p1_r], %[p1] \n\t" \ + "preceu.ph.qbr %[p0_r], %[p0] \n\t" \ + "preceu.ph.qbr %[q0_r], %[q0] \n\t" \ + "preceu.ph.qbr %[q1_r], %[q1] \n\t" \ + "preceu.ph.qbr %[q2_r], %[q2] \n\t" \ + "preceu.ph.qbr %[q3_r], %[q3] \n\t" \ + \ + : [p3_r] "=&r"(p3_r), [p2_r] "=&r"(p2_r), [p1_r] "=&r"(p1_r), \ + [p0_r] "=&r"(p0_r), [q0_r] "=&r"(q0_r), [q1_r] "=&r"(q1_r), \ + [q2_r] "=&r"(q2_r), [q3_r] "=&r"(q3_r) \ + : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \ + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ + } + +#define PACK_RIGHT_4TO7() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbr %[p7_r], %[p7] \n\t" \ + "preceu.ph.qbr %[p6_r], %[p6] \n\t" \ + "preceu.ph.qbr %[p5_r], %[p5] \n\t" \ + "preceu.ph.qbr %[p4_r], %[p4] \n\t" \ + "preceu.ph.qbr %[q4_r], %[q4] \n\t" \ + "preceu.ph.qbr %[q5_r], %[q5] \n\t" \ + "preceu.ph.qbr %[q6_r], %[q6] \n\t" \ + "preceu.ph.qbr %[q7_r], %[q7] \n\t" \ + \ + : [p7_r] "=&r"(p7_r), [p6_r] "=&r"(p6_r), [p5_r] "=&r"(p5_r), \ + [p4_r] "=&r"(p4_r), [q4_r] "=&r"(q4_r), [q5_r] "=&r"(q5_r), \ + [q6_r] "=&r"(q6_r), [q7_r] "=&r"(q7_r) \ + : 
[p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ + [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ + } + +#define COMBINE_LEFT_RIGHT_0TO2() \ + { \ + __asm__ __volatile__( \ + "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \ + "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \ + "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \ + "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \ + "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \ + "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \ + \ + : [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), [q0] "=&r"(q0), \ + [q1] "=&r"(q1), [q2] "=&r"(q2) \ + : [p2_l] "r"(p2_l), [p2_r] "r"(p2_r), [p1_l] "r"(p1_l), \ + [p1_r] "r"(p1_r), [p0_l] "r"(p0_l), [p0_r] "r"(p0_r), \ + [q0_l] "r"(q0_l), [q0_r] "r"(q0_r), [q1_l] "r"(q1_l), \ + [q1_r] "r"(q1_r), [q2_l] "r"(q2_l), [q2_r] "r"(q2_r)); \ + } + +#define COMBINE_LEFT_RIGHT_3TO6() \ + { \ + __asm__ __volatile__( \ + "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \ + "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \ + "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \ + "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \ + "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \ + "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \ + "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \ + "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \ + \ + : [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4), [p3] "=&r"(p3), \ + [q3] "=&r"(q3), [q4] "=&r"(q4), [q5] "=&r"(q5), [q6] "=&r"(q6) \ + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), \ + [p3_l] "r"(p3_l), [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), \ + [p4_r] "r"(p4_r), [p3_r] "r"(p3_r), [q3_l] "r"(q3_l), \ + [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), [q6_l] "r"(q6_l), \ + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), \ + [q6_r] "r"(q6_r)); \ + } + +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h new file mode 
100644 index 000000000..8db3e521f --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h @@ -0,0 +1,356 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ +#define AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_mem/aom_mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +/* processing 4 pixels at the same time + * compute hev and mask in the same function */ +static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit, + uint32_t p1, uint32_t p0, uint32_t p3, + uint32_t p2, uint32_t q0, uint32_t q1, + uint32_t q2, uint32_t q3, + uint32_t thresh, uint32_t *hev, + uint32_t *mask) { + uint32_t c, r, r3, r_k; + uint32_t s1, s2, s3; + uint32_t ones = 0xFFFFFFFF; + uint32_t hev1; + + __asm__ __volatile__( + /* mask |= (abs(p3 - p2) > limit) */ + "subu_s.qb %[c], %[p3], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* mask |= (abs(p2 - p1) > limit) */ + "subu_s.qb %[c], %[p2], %[p1] \n\t" + "subu_s.qb %[r_k], %[p1], %[p2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(p1 - p0) > limit) + * hev |= (abs(p1 - p0) > thresh) + */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], 
%[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], $0, %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(q1 - q0) > limit) + * hev |= (abs(q1 - q0) > thresh) + */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], %[r3], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(q2 - q1) > limit) */ + "subu_s.qb %[c], %[q2], %[q1] \n\t" + "subu_s.qb %[r_k], %[q1], %[q2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r3], %[r3], 24 \n\t" + + /* mask |= (abs(q3 - q2) > limit) */ + "subu_s.qb %[c], %[q3], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3) + : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), + [thresh] "r"(thresh)); + + __asm__ __volatile__( + /* abs(p0 - q0) */ + "subu_s.qb %[c], %[p0], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[p0] \n\t" + "wrdsp %[r3] \n\t" + "or %[s1], %[r_k], %[c] \n\t" + + /* abs(p1 - q1) */ + "subu_s.qb %[c], %[p1], %[q1] \n\t" + "addu_s.qb %[s3], %[s1], %[s1] \n\t" + "pick.qb %[hev1], %[ones], $0 \n\t" + "subu_s.qb %[r_k], %[q1], %[p1] \n\t" + "or %[s2], %[r_k], %[c] \n\t" + + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ + "shrl.qb %[s2], %[s2], 1 \n\t" + "addu_s.qb %[s1], %[s2], %[s3] \n\t" + "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + + "wrdsp %[r] \n\t" + "pick.qb %[s2], $0, %[ones] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), + [s2] "=&r"(s2), [r] 
"+r"(r), [s3] "=&r"(s3) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), + [ones] "r"(ones), [flimit] "r"(flimit)); + + *hev = hev1; + *mask = s2; +} + +static INLINE void filter_hev_mask_flatmask4_dspr2( + uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0, + uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2, + uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) { + uint32_t c, r, r3, r_k, r_flat; + uint32_t s1, s2, s3; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t hev1; + uint32_t flat1; + + __asm__ __volatile__( + /* mask |= (abs(p3 - p2) > limit) */ + "subu_s.qb %[c], %[p3], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* mask |= (abs(p2 - p1) > limit) */ + "subu_s.qb %[c], %[p2], %[p1] \n\t" + "subu_s.qb %[r_k], %[p1], %[p2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(p1 - p0) > limit) + * hev |= (abs(p1 - p0) > thresh) + * flat |= (abs(p1 - p0) > thresh) + */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], $0, %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], $0, %[c] \n\t" + + /* mask |= (abs(q1 - q0) > limit) + * hev |= (abs(q1 - q0) > thresh) + * flat |= (abs(q1 - q0) > thresh) + */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], %[r3], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat 
|= (abs(p0 - p2) > thresh) */ + "subu_s.qb %[c], %[p0], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q0 - q2) > thresh) */ + "subu_s.qb %[c], %[q0], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p3 - p0) > thresh) */ + "subu_s.qb %[c], %[p3], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q3 - q0) > thresh) */ + "subu_s.qb %[c], %[q3], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + "sll %[r_flat], %[r_flat], 24 \n\t" + /* look at stall here */ + "wrdsp %[r_flat] \n\t" + "pick.qb %[flat1], $0, %[ones] \n\t" + + /* mask |= (abs(q2 - q1) > limit) */ + "subu_s.qb %[c], %[q2], %[q1] \n\t" + "subu_s.qb %[r_k], %[q1], %[q2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r3], %[r3], 24 \n\t" + + /* mask |= (abs(q3 - q2) > limit) */ + "subu_s.qb %[c], %[q3], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3), + [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1) + : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), + [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); + + __asm__ __volatile__( + /* abs(p0 - q0) */ + "subu_s.qb %[c], %[p0], %[q0] \n\t" + "subu_s.qb %[r_k], 
%[q0], %[p0] \n\t" + "wrdsp %[r3] \n\t" + "or %[s1], %[r_k], %[c] \n\t" + + /* abs(p1 - q1) */ + "subu_s.qb %[c], %[p1], %[q1] \n\t" + "addu_s.qb %[s3], %[s1], %[s1] \n\t" + "pick.qb %[hev1], %[ones], $0 \n\t" + "subu_s.qb %[r_k], %[q1], %[p1] \n\t" + "or %[s2], %[r_k], %[c] \n\t" + + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ + "shrl.qb %[s2], %[s2], 1 \n\t" + "addu_s.qb %[s1], %[s2], %[s3] \n\t" + "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + + "wrdsp %[r] \n\t" + "pick.qb %[s2], $0, %[ones] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), + [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), + [ones] "r"(ones), [flimit] "r"(flimit)); + + *hev = hev1; + *mask = s2; + *flat = flat1; +} + +static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1, + uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2, + uint32_t q3, uint32_t q4, uint32_t *flat2) { + uint32_t c, r, r_k, r_flat; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t flat1, flat3; + + __asm__ __volatile__( + /* flat |= (abs(p4 - p0) > thresh) */ + "subu_s.qb %[c], %[p4], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p4] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* flat |= (abs(q4 - q0) > thresh) */ + "subu_s.qb %[c], %[q4], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q4] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + "wrdsp %[r] \n\t" + "pick.qb %[flat3], $0, %[ones] \n\t" + + /* flat |= (abs(p1 - p0) > thresh) */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], $0, %[c] \n\t" + + /* flat |= 
(abs(q1 - q0) > thresh) */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p0 - p2) > thresh) */ + "subu_s.qb %[c], %[p0], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q0 - q2) > thresh) */ + "subu_s.qb %[c], %[q0], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p3 - p0) > thresh) */ + "subu_s.qb %[c], %[p3], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q3 - q0) > thresh) */ + "subu_s.qb %[c], %[q3], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + "sll %[r_flat], %[r_flat], 24 \n\t" + "wrdsp %[r_flat] \n\t" + "pick.qb %[flat1], $0, %[ones] \n\t" + /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */ + "and %[flat1], %[flat3], %[flat1] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat), + [flat1] "=&r"(flat1), [flat3] "=&r"(flat3) + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4), + [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); + + *flat2 = flat1; +} +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c new file 
mode 100644 index 000000000..a3b5a9eb1 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c @@ -0,0 +1,589 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" +#include "aom_dsp/mips/loopfilter_filters_dspr2.h" +#include "aom_dsp/mips/loopfilter_macros_dspr2.h" +#include "aom_dsp/mips/loopfilter_masks_dspr2.h" +#include "aom_mem/aom_mem.h" + +#if HAVE_DSPR2 +void aom_lpf_horizontal_8_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint32_t mask; + uint32_t hev, flat; + uint8_t i; + uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p3, p2, p1, p0, q0, q1, q2, q3; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s); + + for (i = 0; 
i < 2; i++) { + sp3 = s - (pitch << 2); + sp2 = sp3 + pitch; + sp1 = sp2 + pitch; + sp0 = sp1 + pitch; + sq0 = s; + sq1 = s + pitch; + sq2 = sq1 + pitch; + sq3 = sq2 + pitch; + + __asm__ __volatile__( + "lw %[p3], (%[sp3]) \n\t" + "lw %[p2], (%[sp2]) \n\t" + "lw %[p1], (%[sp1]) \n\t" + "lw %[p0], (%[sp0]) \n\t" + "lw %[q0], (%[sq0]) \n\t" + "lw %[q1], (%[sq1]) \n\t" + "lw %[q2], (%[sq2]) \n\t" + "lw %[q3], (%[sq3]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0) + : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0)); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + if ((flat == 0) && (mask != 0)) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + __asm__ __volatile__( + "sw %[p1_f0], (%[sp1]) \n\t" + "sw %[p0_f0], (%[sp0]) \n\t" + "sw %[q0_f0], (%[sq0]) \n\t" + "sw %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1)); + } else if ((mask & flat) == 0xFFFFFFFF) { + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + COMBINE_LEFT_RIGHT_0TO2() + + __asm__ __volatile__( + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + + : + : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), + [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), + [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if ((flat != 0) && (mask != 
0)) { + /* filtering */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) 
\n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), + [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), + [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl 
%[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + } + + s = s + 4; + } +} + +void aom_lpf_vertical_8_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev, flat; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p3, p2, p1, p0, q3, q2, q1, q0; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : 
[thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + __asm__ __volatile__( + "lw %[p0], -4(%[s1]) \n\t" + "lw %[p1], -4(%[s2]) \n\t" + "lw %[p2], -4(%[s3]) \n\t" + "lw %[p3], -4(%[s4]) \n\t" + "lw %[q3], (%[s1]) \n\t" + "lw %[q2], (%[s2]) \n\t" + "lw %[q1], (%[s3]) \n\t" + "lw %[q0], (%[s4]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + /* transpose p3, p2, p1, p0 + original (when loaded from memory) + register -4 -3 -2 -1 + p0 p0_0 p0_1 p0_2 p0_3 + p1 p1_0 p1_1 p1_2 p1_3 + p2 p2_0 p2_1 p2_2 p2_3 + p3 p3_0 p3_1 p3_2 p3_3 + + after transpose + register + p0 p3_3 p2_3 p1_3 p0_3 + p1 p3_2 p2_2 p1_2 p0_2 + p2 p3_1 p2_1 p1_1 p0_1 + p3 p3_0 p2_0 p1_0 p0_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose q0, q1, q2, q3 + original (when loaded from memory) + register +1 +2 +3 +4 + q3 q3_0 q3_1 q3_2 q3_3 + q2 q2_0 q2_1 q2_2 q2_3 + q1 q1_0 
q1_1 q1_2 q1_3 + q0 q0_0 q0_1 q0_2 q0_3 + + after transpose + register + q3 q0_3 q1_3 q2_3 q3_3 + q2 q0_2 q1_2 q2_2 q3_2 + q1 q0_1 q1_1 q2_1 q3_1 + q0 q0_0 q1_0 q2_0 q3_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" + "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" + "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" + "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" + + "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" + "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" + "append %[q2], %[sec3], 16 \n\t" + "append %[q0], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), + [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + if ((flat == 0) && (mask != 0)) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + STORE_F0() + } else if ((mask & flat) == 0xFFFFFFFF) { + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + STORE_F1() + } else if ((flat != 0) && (mask != 0)) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) 
\n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s4] "r"(s4)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s3] "r"(s3)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2] 
"+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), + [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), + [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s2] "r"(s2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], +1(%[s1]) \n\t" + "sb %[q2_l], +2(%[s1]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s1] "r"(s1)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) 
\n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); + } + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c new file mode 100644 index 000000000..8d2fd69f7 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c @@ -0,0 +1,734 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" +#include "aom_dsp/mips/loopfilter_filters_dspr2.h" +#include "aom_dsp/mips/loopfilter_macros_dspr2.h" +#include "aom_dsp/mips/loopfilter_masks_dspr2.h" +#include "aom_mem/aom_mem.h" + +#if HAVE_DSPR2 +static void mb_lpf_horizontal_edge(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int count) { + uint32_t mask; + uint32_t hev, flat, flat2; + uint8_t i; + uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0; + uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, 
q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s); + + for (i = 0; i < (2 * count); i++) { + sp7 = s - (pitch << 3); + sp6 = sp7 + pitch; + sp5 = sp6 + pitch; + sp4 = sp5 + pitch; + sp3 = sp4 + pitch; + sp2 = sp3 + pitch; + sp1 = sp2 + pitch; + sp0 = sp1 + pitch; + sq0 = s; + sq1 = s + pitch; + sq2 = sq1 + pitch; + sq3 = sq2 + pitch; + sq4 = sq3 + pitch; + sq5 = sq4 + pitch; + sq6 = sq5 + pitch; + sq7 = sq6 + pitch; + + __asm__ __volatile__( + "lw %[p7], (%[sp7]) \n\t" + "lw %[p6], (%[sp6]) \n\t" + "lw %[p5], (%[sp5]) \n\t" + "lw %[p4], (%[sp4]) \n\t" + "lw %[p3], (%[sp3]) \n\t" + "lw %[p2], (%[sp2]) \n\t" + "lw %[p1], (%[sp1]) \n\t" + "lw %[p0], (%[sp0]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) + : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7)); + + __asm__ __volatile__( + "lw %[q0], (%[sq0]) \n\t" + "lw %[q1], (%[sq1]) \n\t" + "lw %[q2], (%[sq2]) \n\t" + "lw %[q3], (%[sq3]) \n\t" + "lw %[q4], (%[sq4]) \n\t" + "lw %[q5], (%[sq5]) \n\t" + "lw %[q6], (%[sq6]) \n\t" + "lw %[q7], (%[sq7]) \n\t" + + : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), + [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) + : [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0), + [sq4] 
"r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7)); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); + + /* f0 */ + if (((flat2 == 0) && (flat == 0) && (mask != 0)) || + ((flat2 != 0) && (flat == 0) && (mask != 0))) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + __asm__ __volatile__( + "sw %[p1_f0], (%[sp1]) \n\t" + "sw %[p0_f0], (%[sp0]) \n\t" + "sw %[q0_f0], (%[sq0]) \n\t" + "sw %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1)); + } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && + (mask == 0xFFFFFFFF)) { + /* f2 */ + PACK_LEFT_0TO3() + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_0TO3() + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + COMBINE_LEFT_RIGHT_0TO2() + COMBINE_LEFT_RIGHT_3TO6() + + __asm__ __volatile__( + "sw %[p6], (%[sp6]) \n\t" + "sw %[p5], (%[sp5]) \n\t" + "sw %[p4], (%[sp4]) \n\t" + "sw %[p3], (%[sp3]) \n\t" + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + + : + : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6), + [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sw %[q6], (%[sq6]) \n\t" + "sw %[q5], (%[sq5]) \n\t" + "sw %[q4], (%[sq4]) \n\t" + "sw %[q3], (%[sq3]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + + : + : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] 
"r"(q3), + [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6), + [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2), + [sq1] "r"(sq1), [sq0] "r"(sq0)); + } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { + /* f1 */ + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + COMBINE_LEFT_RIGHT_0TO2() + + __asm__ __volatile__( + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + + : + : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), + [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), + [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { + /* f0+f1 */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : 
[p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + "sb %[q0_l], 
+2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), 
[sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { + /* f0 + f1 + f2 */ + /* f0 function */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* f1 function */ + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, + &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, + &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); + + /* f2 function */ + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + if (mask & flat & flat2 & 0x000000FF) { + __asm__ __volatile__( + "sb %[p6_r], (%[sp6]) \n\t" + "sb %[p5_r], (%[sp5]) \n\t" + "sb %[p4_r], (%[sp4]) \n\t" + "sb %[p3_r], (%[sp3]) \n\t" + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), + [sp2] "r"(sp2), [sp1] "r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + "sb %[q3_r], (%[sq3]) \n\t" + "sb %[q4_r], (%[sq4]) \n\t" + "sb %[q5_r], (%[sq5]) \n\t" + "sb %[q6_r], (%[sq6]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); 
+ } else if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r_f1], (%[sp2]) \n\t" + "sb %[p1_r_f1], (%[sp1]) \n\t" + "sb %[p0_r_f1], (%[sp0]) \n\t" + "sb %[q0_r_f1], (%[sq0]) \n\t" + "sb %[q1_r_f1], (%[sq1]) \n\t" + "sb %[q2_r_f1], (%[sq2]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p6_r], %[p6_r], 16 \n\t" + "srl %[p5_r], %[p5_r], 16 \n\t" + "srl %[p4_r], %[p4_r], 16 \n\t" + "srl %[p3_r], %[p3_r], 16 \n\t" + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[q3_r], %[q3_r], 16 \n\t" + "srl %[q4_r], %[q4_r], 16 \n\t" + "srl %[q5_r], %[q5_r], 16 \n\t" + "srl %[q6_r], %[q6_r], 16 \n\t" + + : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), + [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r), + [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), + [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r) + :); + + __asm__ __volatile__( + "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" + "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" + "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" + "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" + "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t" + "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl 
%[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), + [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), + [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p6_r], +1(%[sp6]) \n\t" + "sb %[p5_r], +1(%[sp5]) \n\t" + "sb %[p4_r], +1(%[sp4]) \n\t" + "sb %[p3_r], +1(%[sp3]) \n\t" + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + "sb %[q3_r], +1(%[sq3]) \n\t" + "sb %[q4_r], +1(%[sq4]) \n\t" + "sb %[q5_r], +1(%[sq5]) \n\t" + "sb %[q6_r], +1(%[sq6]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); + } else if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r_f1], +1(%[sp2]) \n\t" + "sb %[p1_r_f1], +1(%[sp1]) \n\t" + "sb %[p0_r_f1], +1(%[sp0]) \n\t" + "sb %[q0_r_f1], +1(%[sq0]) \n\t" + "sb %[q1_r_f1], +1(%[sq1]) \n\t" + "sb %[q2_r_f1], +1(%[sq2]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb 
%[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p6_l], +2(%[sp6]) \n\t" + "sb %[p5_l], +2(%[sp5]) \n\t" + "sb %[p4_l], +2(%[sp4]) \n\t" + "sb %[p3_l], +2(%[sp3]) \n\t" + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + "sb %[q3_l], +2(%[sq3]) \n\t" + "sb %[q4_l], +2(%[sq4]) \n\t" + "sb %[q5_l], +2(%[sq5]) \n\t" + "sb %[q6_l], +2(%[sq6]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); + } else if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l_f1], +2(%[sp2]) \n\t" + "sb %[p1_l_f1], +2(%[sp1]) \n\t" + "sb %[p0_l_f1], +2(%[sp0]) \n\t" + "sb %[q0_l_f1], +2(%[sq0]) \n\t" + "sb %[q1_l_f1], +2(%[sq1]) \n\t" + "sb %[q2_l_f1], +2(%[sq2]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] 
"r"(q2_l_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p6_l], %[p6_l], 16 \n\t" + "srl %[p5_l], %[p5_l], 16 \n\t" + "srl %[p4_l], %[p4_l], 16 \n\t" + "srl %[p3_l], %[p3_l], 16 \n\t" + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[q3_l], %[q3_l], 16 \n\t" + "srl %[q4_l], %[q4_l], 16 \n\t" + "srl %[q5_l], %[q5_l], 16 \n\t" + "srl %[q6_l], %[q6_l], 16 \n\t" + + : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), + [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), + [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), + [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) + :); + + __asm__ __volatile__( + "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" + "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" + "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" + "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" + "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" + "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), + [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), + [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0xFF000000) { + __asm__ __volatile__( + "sb %[p6_l], 
+3(%[sp6]) \n\t" + "sb %[p5_l], +3(%[sp5]) \n\t" + "sb %[p4_l], +3(%[sp4]) \n\t" + "sb %[p3_l], +3(%[sp3]) \n\t" + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + "sb %[q3_l], +3(%[sq3]) \n\t" + "sb %[q4_l], +3(%[sq4]) \n\t" + "sb %[q5_l], +3(%[sq5]) \n\t" + "sb %[q6_l], +3(%[sq6]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3), + [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6)); + } else if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l_f1], +3(%[sp2]) \n\t" + "sb %[p1_l_f1], +3(%[sp1]) \n\t" + "sb %[p0_l_f1], +3(%[sp0]) \n\t" + "sb %[q0_l_f1], +3(%[sq0]) \n\t" + "sb %[q1_l_f1], +3(%[sq1]) \n\t" + "sb %[q2_l_f1], +3(%[sq2]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + } + + s = s + 4; + } +} + +void aom_lpf_horizontal_edge_8_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, 
+ const uint8_t *thresh) { + mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1); +} + +void aom_lpf_horizontal_edge_16_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2); +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c new file mode 100644 index 000000000..28528869b --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c @@ -0,0 +1,757 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <stdlib.h> + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" +#include "aom_dsp/mips/loopfilter_filters_dspr2.h" +#include "aom_dsp/mips/loopfilter_macros_dspr2.h" +#include "aom_dsp/mips/loopfilter_masks_dspr2.h" +#include "aom_mem/aom_mem.h" + +#if HAVE_DSPR2 +void aom_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev, flat, flat2; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + __asm__ __volatile__( + "lw %[p0], -4(%[s1]) \n\t" + "lw %[p1], -4(%[s2]) \n\t" + "lw %[p2], -4(%[s3]) \n\t" + "lw %[p3], -4(%[s4]) \n\t" + "lw %[p4], -8(%[s1]) \n\t" + "lw %[p5], -8(%[s2]) \n\t" + "lw %[p6], -8(%[s3]) \n\t" + "lw %[p7], -8(%[s4]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2),
[p1] "=&r"(p1), [p0] "=&r"(p0), + [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + __asm__ __volatile__( + "lw %[q3], (%[s1]) \n\t" + "lw %[q2], (%[s2]) \n\t" + "lw %[q1], (%[s3]) \n\t" + "lw %[q0], (%[s4]) \n\t" + "lw %[q7], +4(%[s1]) \n\t" + "lw %[q6], +4(%[s2]) \n\t" + "lw %[q5], +4(%[s3]) \n\t" + "lw %[q4], +4(%[s4]) \n\t" + + : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), + [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + /* transpose p3, p2, p1, p0 + original (when loaded from memory) + register -4 -3 -2 -1 + p0 p0_0 p0_1 p0_2 p0_3 + p1 p1_0 p1_1 p1_2 p1_3 + p2 p2_0 p2_1 p2_2 p2_3 + p3 p3_0 p3_1 p3_2 p3_3 + + after transpose + register + p0 p3_3 p2_3 p1_3 p0_3 + p1 p3_2 p2_2 p1_2 p0_2 + p2 p3_1 p2_1 p1_1 p0_1 + p3 p3_0 p2_0 p1_0 p0_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose q0, q1, q2, q3 + original (when loaded from memory) + register +1 +2 +3 +4 + q3 q3_0 q3_1 q3_2 q3_3 + q2 q2_0 q2_1 q2_2 q2_3 + q1 q1_0 q1_1 q1_2 q1_3 + q0 q0_0 q0_1 q0_2 q0_3 + + after transpose + register + q3 q0_3 q1_3 q2_3 q3_3 + q2 q0_2 q1_2 q2_2 q3_2 + q1 q0_1 q1_1 q2_1 q3_1 + q0 q0_0 
q1_0 q2_0 q3_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" + "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" + "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" + "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" + + "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" + "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" + "append %[q2], %[sec3], 16 \n\t" + "append %[q0], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), + [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose p7, p6, p5, p4 + original (when loaded from memory) + register -8 -7 -6 -5 + p4 p4_0 p4_1 p4_2 p4_3 + p5 p5_0 p5_1 p5_2 p5_3 + p6 p6_0 p6_1 p6_2 p6_3 + p7 p7_0 p7_1 p7_2 p7_3 + + after transpose + register + p4 p7_3 p6_3 p5_3 p4_3 + p5 p7_2 p6_2 p5_2 p4_2 + p6 p7_1 p6_1 p5_1 p4_1 + p7 p7_0 p6_0 p5_0 p4_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p4], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t" + "precr.qb.ph %[prim4], %[p6], %[p7] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p7], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p4], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p6], %[p7], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p7], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] "+r"(p6), + [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose q4, q5, q6, q7 + original (when loaded from memory) + register +5 +6 +7 +8 + q7 q7_0 q7_1 q7_2 q7_3 + q6 q6_0 q6_1
q6_2 q6_3 + q5 q5_0 q5_1 q5_2 q5_3 + q4 q4_0 q4_1 q4_2 q4_3 + + after transpose + register + q7 q4_3 q5_3 q6_3 q7_3 + q6 q4_2 q5_2 q6_2 q7_2 + q5 q4_1 q5_1 q6_1 q7_1 + q4 q4_0 q5_0 q6_0 q7_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t" + "precr.qb.ph %[prim2], %[q7], %[q6] \n\t" + "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t" + "precr.qb.ph %[prim4], %[q5], %[q4] \n\t" + + "precrq.qb.ph %[q6], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q4], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q7], %[q6], %[sec3] \n\t" + "precrq.ph.w %[q5], %[q4], %[sec4] \n\t" + "append %[q6], %[sec3], 16 \n\t" + "append %[q4], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5), + [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); + + /* f0: flat == 0 and mask != 0 (flat2 ignored): filter1_dspr2 only */ + if (((flat2 == 0) && (flat == 0) && (mask != 0)) || + ((flat2 != 0) && (flat == 0) && (mask != 0))) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + STORE_F0() + } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && + (mask == 0xFFFFFFFF)) { + /* f2: flat2, flat and mask all 0xFFFFFFFF: wide_mbfilter_dspr2 only */ + PACK_LEFT_0TO3() + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_0TO3() + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + STORE_F2() + } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { + /* f1: flat2 == 0, flat and mask all 0xFFFFFFFF: mbfilter_dspr2 only */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l,
&q3_l); + + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + STORE_F1() + } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { + /* f0 + f1 */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s4] "r"(s4)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) 
\n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s3] "r"(s3)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s2] "r"(s2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if 
(mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], +1(%[s1]) \n\t" + "sb %[q2_l], +2(%[s1]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s1] "r"(s1)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); + } + } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { + /* f0+f1+f2 */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + PACK_LEFT_0TO3() + mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, + &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); + + PACK_RIGHT_0TO3() + mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, + &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); + + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + if (mask & flat & flat2 & 0x000000FF) { + __asm__ __volatile__( + "sb %[p6_r], -7(%[s4]) \n\t" + "sb %[p5_r], -6(%[s4]) \n\t" + "sb %[p4_r], -5(%[s4]) \n\t" + "sb %[p3_r], -4(%[s4]) \n\t" + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [s4] "r"(s4)); + + __asm__ __volatile__( + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" 
+ "sb %[q3_r], +3(%[s4]) \n\t" + "sb %[q4_r], +4(%[s4]) \n\t" + "sb %[q5_r], +5(%[s4]) \n\t" + "sb %[q6_r], +6(%[s4]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [s4] "r"(s4)); + } else if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r_f1], -3(%[s4]) \n\t" + "sb %[p1_r_f1], -2(%[s4]) \n\t" + "sb %[p0_r_f1], -1(%[s4]) \n\t" + "sb %[q0_r_f1], (%[s4]) \n\t" + "sb %[q1_r_f1], +1(%[s4]) \n\t" + "sb %[q2_r_f1], +2(%[s4]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); + } + + __asm__ __volatile__( + "srl %[p6_r], %[p6_r], 16 \n\t" + "srl %[p5_r], %[p5_r], 16 \n\t" + "srl %[p4_r], %[p4_r], 16 \n\t" + "srl %[p3_r], %[p3_r], 16 \n\t" + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[q3_r], %[q3_r], 16 \n\t" + "srl %[q4_r], %[q4_r], 16 \n\t" + "srl %[q5_r], %[q5_r], 16 \n\t" + "srl %[q6_r], %[q6_r], 16 \n\t" + + : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), + [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), + [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), + [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r) + :); + + __asm__ __volatile__( + "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" + "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" + "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" + "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" + "srl 
%[q1_r_f1], %[q1_r_f1], 16 \n\t" + "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), + [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), + [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p6_r], -7(%[s3]) \n\t" + "sb %[p5_r], -6(%[s3]) \n\t" + "sb %[p4_r], -5(%[s3]) \n\t" + "sb %[p3_r], -4(%[s3]) \n\t" + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [s3] "r"(s3)); + + __asm__ __volatile__( + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + "sb %[q3_r], +3(%[s3]) \n\t" + "sb %[q4_r], +4(%[s3]) \n\t" + "sb %[q5_r], +5(%[s3]) \n\t" + "sb %[q6_r], +6(%[s3]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [s3] "r"(s3)); + } else if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r_f1], -3(%[s3]) \n\t" + "sb %[p1_r_f1], -2(%[s3]) \n\t" + "sb %[p0_r_f1], -1(%[s3]) \n\t" + "sb %[q0_r_f1], (%[s3]) \n\t" + "sb %[q1_r_f1], +1(%[s3]) \n\t" + "sb %[q2_r_f1], +2(%[s3]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + 
[q1_f0] "r"(q1_f0), [s3] "r"(s3)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p6_l], -7(%[s2]) \n\t" + "sb %[p5_l], -6(%[s2]) \n\t" + "sb %[p4_l], -5(%[s2]) \n\t" + "sb %[p3_l], -4(%[s2]) \n\t" + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [s2] "r"(s2)); + + __asm__ __volatile__( + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + "sb %[q3_l], +3(%[s2]) \n\t" + "sb %[q4_l], +4(%[s2]) \n\t" + "sb %[q5_l], +5(%[s2]) \n\t" + "sb %[q6_l], +6(%[s2]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [s2] "r"(s2)); + } else if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l_f1], -3(%[s2]) \n\t" + "sb %[p1_l_f1], -2(%[s2]) \n\t" + "sb %[p0_l_f1], -1(%[s2]) \n\t" + "sb %[q0_l_f1], (%[s2]) \n\t" + "sb %[q1_l_f1], +1(%[s2]) \n\t" + "sb %[q2_l_f1], +2(%[s2]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); + } + + __asm__ __volatile__( + "srl %[p6_l], %[p6_l], 16 \n\t" + "srl %[p5_l], %[p5_l], 16 \n\t" + "srl %[p4_l], %[p4_l], 16 
\n\t" + "srl %[p3_l], %[p3_l], 16 \n\t" + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[q3_l], %[q3_l], 16 \n\t" + "srl %[q4_l], %[q4_l], 16 \n\t" + "srl %[q5_l], %[q5_l], 16 \n\t" + "srl %[q6_l], %[q6_l], 16 \n\t" + + : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), + [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), + [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), + [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) + :); + + __asm__ __volatile__( + "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" + "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" + "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" + "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" + "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" + "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), + [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), + [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0xFF000000) { + __asm__ __volatile__( + "sb %[p6_l], -7(%[s1]) \n\t" + "sb %[p5_l], -6(%[s1]) \n\t" + "sb %[p4_l], -5(%[s1]) \n\t" + "sb %[p3_l], -4(%[s1]) \n\t" + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [s1] "r"(s1)); + + __asm__ __volatile__( + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], 1(%[s1]) \n\t" + "sb %[q2_l], 2(%[s1]) \n\t" + "sb %[q3_l], 3(%[s1]) \n\t" + "sb %[q4_l], 4(%[s1]) \n\t" + "sb %[q5_l], 5(%[s1]) \n\t" + "sb %[q6_l], 6(%[s1]) \n\t" 
+ + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [s1] "r"(s1)); + } else if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l_f1], -3(%[s1]) \n\t" + "sb %[p1_l_f1], -2(%[s1]) \n\t" + "sb %[p0_l_f1], -1(%[s1]) \n\t" + "sb %[q0_l_f1], (%[s1]) \n\t" + "sb %[q1_l_f1], +1(%[s1]) \n\t" + "sb %[q2_l_f1], +2(%[s1]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); + } + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_msa.h b/third_party/aom/aom_dsp/mips/loopfilter_msa.h new file mode 100644 index 000000000..450594262 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_msa.h @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_LOOPFILTER_MSA_H_ +#define AOM_DSP_LOOPFILTER_MSA_H_ + +#include "aom_dsp/mips/macros_msa.h" + +#define AOM_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ + p1_out, p0_out, q0_out, q1_out) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ + v8i16 q0_sub_p0_r, filt_r, cnst3h; \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + filt = filt & (v16i8)hev_in; \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ + filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + /* combine left and right part */ \ + filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \ + \ + filt = filt & (v16i8)mask_in; \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + \ + filt = __msa_srari_b(filt1, 1); \ + hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ + filt = filt & (v16i8)hev_in; \ + \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ + } + +#define AOM_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ + p1_out, p0_out, q0_out, q1_out) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ 
+ v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + \ + filt = filt & (v16i8)hev_in; \ + \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ + filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \ + filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ + filt_l += q0_sub_p0_l; \ + filt_l = __msa_sat_s_h(filt_l, 7); \ + \ + filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ + filt = filt & (v16i8)mask_in; \ + \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + \ + filt = __msa_srari_b(filt1, 1); \ + hev_in = __msa_xori_b((v16u8)hev_in, 0xff); /* overwrites the caller's hev_in */ \ + filt = filt & (v16i8)hev_in; \ + \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ + } +/* AOM_FLAT4: flat_out is read before it is written (in/out); also uses a caller-scope `mask`. */ +#define AOM_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ + { \ + v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ + v16u8 zero_in = { 0 }; \ + \ + tmp_flat4 = __msa_ori_b(zero_in, 1); \ + p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ + q2_a_sub_q0 = __msa_asub_u_b(q2_in,
q0_in); \ + p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ + q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ + \ + p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ + flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ + p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ + flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ + \ + flat_out = (tmp_flat4 < (v16u8)flat_out); \ + flat_out = __msa_xori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ + } + +#define AOM_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ + q6_in, q7_in, flat_in, flat2_out) \ + { \ + v16u8 tmp_flat5, zero_in = { 0 }; \ + v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ + v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ + \ + tmp_flat5 = __msa_ori_b(zero_in, 1); \ + p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ + q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ + p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ + q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \ + p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \ + q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \ + p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \ + q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \ + \ + p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \ + flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \ + flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \ + p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \ + flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \ + p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ + flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ + \ + flat2_out = (tmp_flat5 < (v16u8)flat2_out); \ + flat2_out = __msa_xori_b(flat2_out, 0xff); \ + flat2_out = flat2_out & flat_in; \ + } + +#define AOM_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ + q1_filt8_out, q2_filt8_out) \ + { \ + v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ + \ + tmp_filt8_2 = p2_in + p1_in + p0_in; \ + 
tmp_filt8_0 = p3_in << 1; \ + \ + tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in; \ + tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in; \ + p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in; \ + p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = q2_in + q1_in + q0_in; \ + tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1; \ + tmp_filt8_0 = tmp_filt8_2 + (p0_in); \ + tmp_filt8_0 = tmp_filt8_0 + (p3_in); \ + p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3); \ + \ + tmp_filt8_0 = q2_in + q3_in; \ + tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0; \ + tmp_filt8_1 = q3_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0; \ + q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_0 = tmp_filt8_2 + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + q0_in; \ + q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = tmp_filt8_0 - p2_in; \ + tmp_filt8_0 = q1_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1; \ + q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + } + +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ + flat_out) \ + { \ + v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + /* absolute subtraction of pixel values */ \ + p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \ + p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \ + p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \ + q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \ + q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \ + q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \ + p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \ + p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \ + \ + /* calculation of hev */ \ + flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = thresh_in < (v16u8)flat_out; \ + \ + /* 
calculation of mask */ \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m >>= 1; \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \ + \ + mask_out = b_limit_in < p0_asub_q0_m; \ + mask_out = __msa_max_u_b(flat_out, mask_out); \ + p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \ + \ + mask_out = limit_in < (v16u8)mask_out; \ + mask_out = __msa_xori_b(mask_out, 0xff); \ + } +#endif /* AOM_DSP_LOOPFILTER_MSA_H_ */ diff --git a/third_party/aom/aom_dsp/mips/macros_msa.h b/third_party/aom/aom_dsp/mips/macros_msa.h new file mode 100644 index 000000000..48fbcfd47 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/macros_msa.h @@ -0,0 +1,2057 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_MACROS_MSA_H_ +#define AOM_DSP_MIPS_MACROS_MSA_H_ + +#include <msa.h> + +#include "./aom_config.h" +#include "aom/aom_integer.h" + +#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) +#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) + +#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) +#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) + +#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_SW(...)
LD_W(v4i32, __VA_ARGS__) + +#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) +#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) + +#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) + +#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SW(...) ST_W(v4i32, __VA_ARGS__) + +#if (__mips_isa_rev >= 6) +#define LH(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint16_t val_m; \ + \ + __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val_m; \ + \ + __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m_combined = 0; \ + /* locals are named apart from LW's psrc_m/val_m so LW(psrc_m1) cannot self-initialize */ \ + val0_m = LW(psrc_m1); \ + val1_m = LW(psrc_m1 + 4); \ + \ + val_m_combined = (uint64_t)(val1_m); \ + val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \ + val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \ + \ + val_m_combined; \ + }) +#endif // (__mips == 64) + +#define SH(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint16_t val_m = (val); \ + \ + __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SW(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + :
[val_m] "r"(val_m)); \ + } + +#define SD(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint64_t val_m = (val); \ + \ + __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } +#else // !(__mips_isa_rev >= 6) +#define LH(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint16_t val_m; \ + \ + __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val_m; \ + \ + __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m_combined = 0; \ + \ + val0_m = LW(psrc_m1); \ + val1_m = LW(psrc_m1 + 4); \ + \ + val_m_combined = (uint64_t)(val1_m); \ + val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \ + val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \ + \ + val_m_combined; \ + }) +#endif // (__mips == 64) + +#define SH(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint16_t val_m = (val); \ + \ + __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SW(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SD(val, 
pdst) \ + { \ + uint8_t *pdst_m1 = (uint8_t *)(pdst); \ + uint32_t val0_m, val1_m; \ + \ + val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_m, pdst_m1); \ + SW(val1_m, pdst_m1 + 4); \ + } +#endif // (__mips_isa_rev >= 6) + +/* Description : Load 4 words with stride + Arguments : Inputs - psrc, stride (may be any integer expression) + Outputs - out0, out1, out2, out3 + Details : Load word in 'out0' from (psrc) + Load word in 'out1' from (psrc + stride) + Load word in 'out2' from (psrc + 2 * stride) + Load word in 'out3' from (psrc + 3 * stride) +*/ +#define LW4(psrc, stride, out0, out1, out2, out3) \ + { \ + out0 = LW((psrc)); \ + out1 = LW((psrc) + (stride)); \ + out2 = LW((psrc) + 2 * (stride)); \ + out3 = LW((psrc) + 3 * (stride)); \ + } + +/* Description : Load double words with stride + Arguments : Inputs - psrc, stride (may be any integer expression) + Outputs - out0, out1 + Details : Load double word in 'out0' from (psrc) + Load double word in 'out1' from (psrc + stride) +*/ +#define LD2(psrc, stride, out0, out1) \ + { \ + out0 = LD((psrc)); \ + out1 = LD((psrc) + (stride)); \ + } +#define LD4(psrc, stride, out0, out1, out2, out3) \ + { \ + LD2((psrc), (stride), out0, out1); \ + LD2((psrc) + 2 * (stride), (stride), out2, out3); \ + } + +/* Description : Store 4 words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store word from 'in0' to (pdst) + Store word from 'in1' to (pdst + stride) + Store word from 'in2' to (pdst + 2 * stride) + Store word from 'in3' to (pdst + 3 * stride) +*/ +#define SW4(in0, in1, in2, in3, pdst, stride) \ + { \ + SW(in0, (pdst)); \ + SW(in1, (pdst) + (stride)); \ + SW(in2, (pdst) + 2 * (stride)); \ + SW(in3, (pdst) + 3 * (stride)); \ + } + +/* Description : Store 4 double words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store double word from 'in0' to (pdst) + Store double word from 'in1' to (pdst + stride) + Store double word from 'in2' to (pdst + 2 * stride) + Store double
word from 'in3' to (pdst + 3 * stride) +*/ +#define SD4(in0, in1, in2, in3, pdst, stride) \ + { \ + SD(in0, (pdst)) \ + SD(in1, (pdst) + stride); \ + SD(in2, (pdst) + 2 * stride); \ + SD(in3, (pdst) + 3 * stride); \ + } + +/* Description : Load vectors with 16 byte elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Load 16 byte elements in 'out0' from (psrc) + Load 16 byte elements in 'out1' from (psrc + stride) +*/ +#define LD_B2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_B(RTYPE, (psrc)); \ + out1 = LD_B(RTYPE, (psrc) + stride); \ + } +#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) +#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) + +#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ + } +#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) + +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } +#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) + +#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ + { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ + } +#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) +#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) + +#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ + { \ + LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ + LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ + } +#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) + +#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7) \ + { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ + } +#define LD_UB8(...) 
LD_B8(v16u8, __VA_ARGS__) +#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) + +/* Description : Load vectors with 8 halfword elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Details : Load 8 halfword elements in 'out0' from (psrc) + Load 8 halfword elements in 'out1' from (psrc + stride) +*/ +#define LD_H2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_H(RTYPE, (psrc)); \ + out1 = LD_H(RTYPE, (psrc) + (stride)); \ + } +#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) + +#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_H2(RTYPE, (psrc), stride, out0, out1); \ + LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } +#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) + +#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7) \ + { \ + LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ + } +#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) + +#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7, out8, out9, out10, out11, out12, out13, out14, out15) \ + { \ + LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ + out7); \ + LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ + out13, out14, out15); \ + } +#define LD_SH16(...) 
LD_H16(v8i16, __VA_ARGS__) + +/* Description : Load 4x4 block of signed halfword elements from 1D source + data into 4 vectors (Each vector with 4 signed halfwords) + Arguments : Input - psrc + Outputs - out0, out1, out2, out3 +*/ +#define LD4x4_SH(psrc, out0, out1, out2, out3) \ + { \ + out0 = LD_SH(psrc); \ + out2 = LD_SH(psrc + 8); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ + } + +/* Description : Load 2 vectors of signed word elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - signed word +*/ +#define LD_SW2(psrc, stride, out0, out1) \ + { \ + out0 = LD_SW((psrc)); \ + out1 = LD_SW((psrc) + stride); \ + } + +/* Description : Store vectors of 16 byte elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 16 byte elements from 'in0' to (pdst) + Store 16 byte elements from 'in1' to (pdst + stride) +*/ +#define ST_B2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_B(RTYPE, in0, (pdst)); \ + ST_B(RTYPE, in1, (pdst) + stride); \ + } +#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) + +#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_B2(RTYPE, in0, in1, (pdst), stride); \ + ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } +#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) + +#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ + { \ + ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ + ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ + } +#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) + +/* Description : Store vectors of 8 halfword elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 8 halfword elements from 'in0' to (pdst) + Store 8 halfword elements from 'in1' to (pdst + stride) +*/ +#define ST_H2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_H(RTYPE, in0, (pdst)); \ + ST_H(RTYPE, in1, (pdst) + stride); \ + } +#define ST_SH2(...) 
ST_H2(v8i16, __VA_ARGS__) + +#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_H2(RTYPE, in0, in1, (pdst), stride); \ + ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } +#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) + +#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ + { \ + ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ + ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ + } +#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) + +/* Description : Store vectors of word elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 4 word elements from 'in0' to (pdst) + Store 4 word elements from 'in1' to (pdst + stride) +*/ +#define ST_SW2(in0, in1, pdst, stride) \ + { \ + ST_SW(in0, (pdst)); \ + ST_SW(in1, (pdst) + stride); \ + } + +/* Description : Store 2x4 byte block to destination memory from input vector + Arguments : Inputs - in, stidx, pdst, stride + Details : Index 'stidx' halfword element from 'in' vector is copied to + the GP register and stored to (pdst) + Index 'stidx+1' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + stride) + Index 'stidx+2' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + 2 * stride) + Index 'stidx+3' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + 3 * stride) +*/ +#define ST2x4_UB(in, stidx, pdst, stride) \ + { \ + uint16_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ + out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ + out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ + out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ + \ + SH(out0_m, pblk_2x4_m); \ + SH(out1_m, pblk_2x4_m + stride); \ + SH(out2_m, pblk_2x4_m + 2 * stride); \ + SH(out3_m, pblk_2x4_m + 3 * stride); \ + } + +/* Description : Store 4x2 byte block to destination memory from 
input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 word element from 'in' vector is copied to the GP + register and stored to (pdst) + Index 1 word element from 'in' vector is copied to the GP + register and stored to (pdst + stride) +*/ +#define ST4x2_UB(in, pdst, stride) \ + { \ + uint32_t out0_m, out1_m; \ + uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32)in, 0); \ + out1_m = __msa_copy_u_w((v4i32)in, 1); \ + \ + SW(out0_m, pblk_4x2_m); \ + SW(out1_m, pblk_4x2_m + stride); \ + } + +/* Description : Store 4x4 byte block to destination memory from input vectors + Arguments : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride + Details : 'idx0' word element from input vector 'in0' is copied to the + GP register and stored to (pdst) + 'idx1' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + stride) + 'idx2' word element from input vector 'in1' is copied to the + GP register and stored to (pdst + 2 * stride) + 'idx3' word element from input vector 'in1' is copied to the + GP register and stored to (pdst + 3 * stride) +*/ +#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \ + { \ + uint32_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ + out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ + out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ + out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ + \ + SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ + } +#define ST4x8_UB(in0, in1, pdst, stride) \ + { \ + uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ + \ + ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ + ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ + } + +/* Description : Store 8x1 byte block to destination memory from input vector + Arguments : Inputs - in, pdst + Details : Index 0 double word element from 'in' vector is copied to the + GP register and stored to (pdst) +*/ +#define
ST8x1_UB(in, pdst) \ + { \ + uint64_t out0_m; \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + SD(out0_m, pdst); \ + } + +/* Description : Store 8x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 double word element from 'in' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in' vector is copied to the + GP register and stored to (pdst + stride) +*/ +#define ST8x2_UB(in, pdst, stride) \ + { \ + uint64_t out0_m, out1_m; \ + uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + out1_m = __msa_copy_u_d((v2i64)in, 1); \ + \ + SD(out0_m, pblk_8x2_m); \ + SD(out1_m, pblk_8x2_m + stride); \ + } + +/* Description : Store 8x4 byte block to destination memory from input + vectors + Arguments : Inputs - in0, in1, pdst, stride + Details : Index 0 double word element from 'in0' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in0' vector is copied to the + GP register and stored to (pdst + stride) + Index 0 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 2 * stride) + Index 1 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 3 * stride) +*/ +#define ST8x4_UB(in0, in1, pdst, stride) \ + { \ + uint64_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in0, 0); \ + out1_m = __msa_copy_u_d((v2i64)in0, 1); \ + out2_m = __msa_copy_u_d((v2i64)in1, 0); \ + out3_m = __msa_copy_u_d((v2i64)in1, 1); \ + \ + SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ + } + +/* Description : average with rounding (in0 + in1 + 1) / 2. + Arguments : Inputs - in0, in1, in2, in3, + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned byte element from 'in0' vector is added with + each unsigned byte element from 'in1' vector. 
Then the average + with rounding is calculated and written to 'out0' +*/ +#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ + out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ + } +#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) + +#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ + AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ + } +#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) + +/* Description : Immediate number of elements to slide with zero + Arguments : Inputs - in0, in1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'zero_m' vector are slid into 'in0' by + value specified in the 'slide_val' +*/ +#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ + { \ + v16i8 zero_m = { 0 }; \ + out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ + } +#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) + +#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \ + slide_val) \ + { \ + SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ + SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ + } +#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__) + +/* Description : Immediate number of elements to slide + Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by + value specified in the 'slide_val' +*/ +#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ + { \ + out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ + } +#define SLDI_B2_UB(...) 
SLDI_B2(v16u8, __VA_ARGS__) +#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) + +#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \ + out2, slide_val) \ + { \ + SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ + out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ + } +#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) +#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) + +/* Description : Shuffle byte vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0' & 'in1' are copied selectively to + 'out0' as per control vector 'mask0' +*/ +#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ + } +#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) +#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) +#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) + +#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ + out3) \ + { \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ + } +#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) +#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Unsigned byte elements from 'mult0' are multiplied with + unsigned byte elements from 'cnst0' producing a result + twice the size of input i.e. unsigned halfword. 
+ The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ + out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ + } +#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) + +#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result + twice the size of input i.e. signed halfword. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ + } +#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) + +#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) + +/* Description : Dot product of halfword vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'mult0' are multiplied with + signed halfword elements from 'cnst0' producing a result + twice the size of input i.e. 
signed word. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ + } +#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) + +#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) + +/* Description : Dot product of word vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed word elements from 'mult0' are multiplied with + signed word elements from 'cnst0' producing a result + twice the size of input i.e. signed double word. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ + } +#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) + +/* Description : Dot product & addition of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result + twice the size of input i.e. signed halfword. 
+ The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ + } +#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) + +#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) + +/* Description : Dot product & addition of halfword vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'mult0' are multiplied with + signed halfword elements from 'cnst0' producing a result + twice the size of input i.e. signed word. + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ + } +#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) + +/* Description : Dot product & addition of double word vector elements + Arguments : Inputs - mult0, mult1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed word element from 'mult0' is multiplied with itself + producing an intermediate result twice the size of input + i.e. 
signed double word + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ + out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ + } +#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__) + +/* Description : Minimum values between unsigned elements of + either vector are copied to the output vector + Arguments : Inputs - in0, in1, min_vec + Outputs - in place operation + Return Type - as per RTYPE + Details : Minimum of unsigned halfword element values from 'in0' and + 'min_vec' are written to output vector 'in0' +*/ +#define MIN_UH2(RTYPE, in0, in1, min_vec) \ + { \ + in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ + in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ + } +#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__) + +#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \ + { \ + MIN_UH2(RTYPE, in0, in1, min_vec); \ + MIN_UH2(RTYPE, in2, in3, min_vec); \ + } +#define MIN_UH4_UH(...) 
MIN_UH4(v8u16, __VA_ARGS__) + +/* Description : Clips all signed halfword elements of input vector + between 0 & 255 + Arguments : Input - in + Output - out_m + Return Type - signed halfword +*/ +#define CLIP_SH_0_255(in) \ + ({ \ + v8i16 max_m = __msa_ldi_h(255); \ + v8i16 out_m; \ + \ + out_m = __msa_maxi_s_h((v8i16)in, 0); \ + out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ + out_m; \ + }) +#define CLIP_SH2_0_255(in0, in1) \ + { \ + in0 = CLIP_SH_0_255(in0); \ + in1 = CLIP_SH_0_255(in1); \ + } +#define CLIP_SH4_0_255(in0, in1, in2, in3) \ + { \ + CLIP_SH2_0_255(in0, in1); \ + CLIP_SH2_0_255(in2, in3); \ + } + +/* Description : Horizontal addition of 4 signed word elements of input vector + Arguments : Input - in (signed word vector) + Output - sum_m (i32 sum) + Return Type - signed word (GP) + Details : 4 signed word elements of 'in' vector are added together and + the resulting integer sum is returned +*/ +#define HADD_SW_S32(in) \ + ({ \ + v2i64 res0_m, res1_m; \ + int32_t sum_m; \ + \ + res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ + res1_m = __msa_splati_d(res0_m, 1); \ + res0_m = res0_m + res1_m; \ + sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ + sum_m; \ + }) + +/* Description : Horizontal addition of 8 unsigned halfword elements + Arguments : Inputs - in (unsigned halfword vector) + Outputs - sum_m (u32 sum) + Return Type - unsigned word + Details : 8 unsigned halfword elements of input vector are added + together and the resulting integer sum is returned +*/ +#define HADD_UH_U32(in) \ + ({ \ + v4u32 res_m; \ + v2u64 res0_m, res1_m; \ + uint32_t sum_m; \ + \ + res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ + res0_m = __msa_hadd_u_d(res_m, res_m); \ + res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ + res0_m = res0_m + res1_m; \ + sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ + sum_m; \ + }) + +/* Description : Horizontal addition of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + 
Details : Each unsigned odd byte element from 'in0' is added to + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HADD_UB2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ + } +#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) + +#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + HADD_UB2(RTYPE, in0, in1, out0, out1); \ + HADD_UB2(RTYPE, in2, in3, out2, out3); \ + } +#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__) + +/* Description : Horizontal subtraction of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is subtracted from + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ + } +#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) + +/* Description : SAD (Sum of Absolute Difference) + Arguments : Inputs - in0, in1, ref0, ref1 + Outputs - sad_m (halfword vector) + Return Type - unsigned halfword + Details : Absolute difference of all the byte elements from 'in0' with + 'ref0' is calculated and preserved in 'diff0'. Then even-odd + pairs are added together to generate 8 halfword results. 
+*/ +#define SAD_UB2_UH(in0, in1, ref0, ref1) \ + ({ \ + v16u8 diff0_m, diff1_m; \ + v8u16 sad_m = { 0 }; \ + \ + diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \ + diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \ + \ + sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \ + sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \ + \ + sad_m; \ + }) + +/* Description : Horizontal subtraction of signed halfword vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed odd halfword element from 'in0' is subtracted from + even signed halfword element from 'in0' (pairwise) and the + word result is written to 'out0' +*/ +#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ + out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ + } +#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__) + +/* Description : Set element n input vector to GPR value + Arguments : Inputs - in0, in1, in2, in3 + Output - out + Return Type - as per RTYPE + Details : Set element 0 in vector 'out' to value specified in 'in0' +*/ +#define INSERT_W2(RTYPE, in0, in1, out) \ + { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + } +#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__) + +#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \ + { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ + } +#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) +#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) + +#define INSERT_D2(RTYPE, in0, in1, out) \ + { \ + out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ + out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ + } +#define INSERT_D2_UB(...) 
INSERT_D2(v16u8, __VA_ARGS__) +#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) + +/* Description : Interleave even byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ + } +#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) +#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) + +/* Description : Interleave even halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ + out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ + } +#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) +#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) +#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__) + +/* Description : Interleave even word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even word elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ + out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ + } +#define ILVEV_W2_SB(...) 
ILVEV_W2(v16i8, __VA_ARGS__) + +/* Description : Interleave even double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double word elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ + out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ + } +#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__) + +/* Description : Interleave left half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of byte elements of 'in0' and 'in1' are interleaved + and written to 'out0'. +*/ +#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ + } +#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) +#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) +#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__) +#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) + +#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) +#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__) +#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) + +/* Description : Interleave left half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. 
+*/ +#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ + } +#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) +#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__) + +/* Description : Interleave left half of word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of word elements of 'in0' and 'in1' are interleaved + and written to 'out0'. +*/ +#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ + } +#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__) +#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) + +/* Description : Interleave right half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements of 'in0' and 'in1' are interleaved + and written to out0. +*/ +#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ + } +#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) +#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) +#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) +#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) + +#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) +#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) +#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) +#define ILVR_B4_SH(...) 
ILVR_B4(v8i16, __VA_ARGS__) + +#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ + in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, \ + out5, out6, out7) \ + { \ + ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3); \ + ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5, \ + out6, out7); \ + } +#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__) + +/* Description : Interleave right half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ + } +#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) +#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) + +#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) + +#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ + } +#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) +#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) + +#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_W4_UB(...) 
ILVR_W4(v16u8, __VA_ARGS__) + +/* Description : Interleave right half of double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of double word elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ + out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ + } +#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) +#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) +#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) + +#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ + { \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ + } +#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__) + +#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) +#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) + +/* Description : Interleave both left and right half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements from 'in0' and 'in1' are + interleaved and written to 'out0' +*/ +#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + } +#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) +#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) +#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) +#define ILVRL_B2_SH(...) 
ILVRL_B2(v8i16, __VA_ARGS__) + +#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ + } +#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) +#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) + +#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ + } +#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) +#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) +#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) + +/* Description : Saturate the halfword element values to the max + unsigned value of (sat_val + 1) bits + The element data width remains unchanged + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned halfword element from 'in0' is saturated to the + value generated with (sat_val + 1) bit range. + The results are written in place +*/ +#define SAT_UH2(RTYPE, in0, in1, sat_val) \ + { \ + in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ + in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ + } +#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__) + +#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \ + { \ + SAT_UH2(RTYPE, in0, in1, sat_val); \ + SAT_UH2(RTYPE, in2, in3, sat_val) \ + } +#define SAT_UH4_UH(...) 
SAT_UH4(v8u16, __VA_ARGS__) + +/* Description : Saturate the halfword element values to the + signed range of (sat_val + 1) bits + The element data width remains unchanged + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Each signed halfword element from 'in0' is saturated to the + value generated with (sat_val + 1) bit range + The results are written in place +*/ +#define SAT_SH2(RTYPE, in0, in1, sat_val) \ + { \ + in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ + in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ + } +#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__) + +#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \ + { \ + SAT_SH2(RTYPE, in0, in1, sat_val); \ + SAT_SH2(RTYPE, in2, in3, sat_val); \ + } +#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__) + +/* Description : Indexed halfword element values are replicated to all + elements in output vector + Arguments : Inputs - in, idx0, idx1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : 'idx0' element value from 'in' vector is replicated to all + elements in 'out0' vector + Valid index range for halfword operation is 0-7 +*/ +#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ + out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ + } +#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) + +#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \ + { \ + SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ + SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ + } +#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__) +#define SPLATI_H4_SH(...)
SPLATI_H4(v8i16, __VA_ARGS__) + +/* Description : Pack even byte elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' are copied to the left half of + 'out0' & even byte elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ + } +#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) +#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) +#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) + +#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) +#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) +#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) + +/* Description : Pack even halfword elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' are copied to the left half of + 'out0' & even halfword elements of 'in1' are copied to the + right half of 'out0'. +*/ +#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ + } +#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) +#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) + +#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_H4_SH(...) 
PCKEV_H4(v8i16, __VA_ARGS__) + +/* Description : Pack even double word elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double elements of 'in0' are copied to the left half of + 'out0' & even double elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ + out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ + } +#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) +#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__) + +#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__) + +/* Description : Each byte element is logically xor'ed with immediate 128 + Arguments : Inputs - in0, in1 + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned byte element from input vector 'in0' is + logically xor'ed with 128 and the result is stored in-place. +*/ +#define XORI_B2_128(RTYPE, in0, in1) \ + { \ + in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ + in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ + } +#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) +#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) + +#define XORI_B3_128(RTYPE, in0, in1, in2) \ + { \ + XORI_B2_128(RTYPE, in0, in1); \ + in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ + } +#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) + +#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \ + { \ + XORI_B2_128(RTYPE, in0, in1); \ + XORI_B2_128(RTYPE, in2, in3); \ + } +#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) +#define XORI_B4_128_SB(...) 
XORI_B4_128(v16i8, __VA_ARGS__) + +#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \ + { \ + XORI_B4_128(RTYPE, in0, in1, in2, in3); \ + XORI_B3_128(RTYPE, in4, in5, in6); \ + } +#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__) + +/* Description : Average of signed halfword elements -> (a + b) / 2 + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3 + Return Type - as per RTYPE + Details : Each signed halfword element from 'in0' is added to each + signed halfword element of 'in1' with full precision resulting + in one extra bit in the result. The result is then divided by + 2 and written to 'out0' +*/ +#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \ + out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \ + out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \ + } +#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__) + +/* Description : Addition of signed halfword elements and signed saturation + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'in0' are added to signed + halfword elements of 'in1'. The result is then signed saturated + between halfword data type range +*/ +#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ + } +#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) + +#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ADDS_SH4_SH(...) 
ADDS_SH4(v8i16, __VA_ARGS__) + +/* Description : Shift left all elements of vector (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is left shifted by 'shift' and + the result is written in-place. +*/ +#define SLLI_4V(in0, in1, in2, in3, shift) \ + { \ + in0 = in0 << shift; \ + in1 = in1 << shift; \ + in2 = in2 << shift; \ + in3 = in3 << shift; \ + } + +/* Description : Arithmetic shift right all elements of vector + (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is right shifted by 'shift' and + the result is written in-place. 'shift' is a GP variable. +*/ +#define SRA_4V(in0, in1, in2, in3, shift) \ + { \ + in0 = in0 >> shift; \ + in1 = in1 >> shift; \ + in2 = in2 >> shift; \ + in3 = in3 >> shift; \ + } + +/* Description : Shift right arithmetic rounded words + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetically by + the number of bits in the corresponding element in the vector + 'shift'. The last discarded bit is added to shifted value for + rounding and the result is written in-place. + 'shift' is a vector. +*/ +#define SRAR_W2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ + in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ + } + +#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRAR_W2(RTYPE, in0, in1, shift) \ + SRAR_W2(RTYPE, in2, in3, shift) \ + } +#define SRAR_W4_SW(...) 
SRAR_W4(v4i32, __VA_ARGS__) + +/* Description : Shift right arithmetic rounded (immediate) + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetically by + the value in 'shift'. The last discarded bit is added to the + shifted value for rounding and the result is written in-place. + 'shift' is an immediate value. +*/ +#define SRARI_H2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ + in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ + } +#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) +#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) + +#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRARI_H2(RTYPE, in0, in1, shift); \ + SRARI_H2(RTYPE, in2, in3, shift); \ + } +#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) +#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) + +#define SRARI_W2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ + in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ + } +#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) + +#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRARI_W2(RTYPE, in0, in1, shift); \ + SRARI_W2(RTYPE, in2, in3, shift); \ + } +#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__) + +/* Description : Logical shift right all elements of vector (immediate) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - out0, out1, out2, out3 + Return Type - as per RTYPE + Details : Each element of vector 'in0' is right shifted by 'shift' and + the result is written to 'out0'. 'shift' is an immediate value.
+*/ +#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \ + { \ + out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \ + out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \ + out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \ + out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \ + } +#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__) + +/* Description : Multiplication of pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element from 'in0' is multiplied with elements from 'in1' + and the result is written to 'out0' +*/ +#define MUL2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ + } +#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + MUL2(in0, in1, in2, in3, out0, out1); \ + MUL2(in4, in5, in6, in7, out2, out3); \ + } + +/* Description : Addition of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in0' is added to 'in1' and result is written + to 'out0'. +*/ +#define ADD2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ + } +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ + } + +/* Description : Subtraction of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in1' is subtracted from 'in0' and result is + written to 'out0'. 
+*/ +#define SUB2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + } +#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + out2 = in4 - in5; \ + out3 = in6 - in7; \ + } + +/* Description : Sign extend halfword elements from right half of the vector + Arguments : Input - in (halfword vector) + Output - out (sign extended word vector) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved with same vector 'in0' to generate + 4 word elements keeping sign intact +*/ +#define UNPCK_R_SH_SW(in, out) \ + { \ + v8i16 sign_m; \ + \ + sign_m = __msa_clti_s_h((v8i16)in, 0); \ + out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ + } + +/* Description : Zero extend unsigned byte elements to halfword elements + Arguments : Input - in (unsigned byte vector) + Outputs - out0, out1 (unsigned halfword vectors) + Return Type - signed halfword + Details : Zero extended right half of vector is returned in 'out0' + Zero extended left half of vector is returned in 'out1' +*/ +#define UNPCK_UB_SH(in, out0, out1) \ + { \ + v16i8 zero_m = { 0 }; \ + \ + ILVRL_B2_SH(zero_m, in, out0, out1); \ + } + +/* Description : Sign extend halfword elements from input vector and return + the result in pair of vectors + Arguments : Input - in (halfword vector) + Outputs - out0, out1 (sign extended word vectors) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved right with same vector 'in0' to + generate 4 signed word elements in 'out0' + Then interleaved left with same vector 'in0' to + generate 4 signed word elements in 'out1' +*/ +#define UNPCK_SH_SW(in, out0, out1) \ + { \ + v8i16 tmp_m; \ + \ + tmp_m = __msa_clti_s_h((v8i16)in, 0); \ + ILVRL_H2_SW(tmp_m, in, out0, out1); \ + } + +/* Description : Butterfly of 4 input vectors + Arguments : Inputs - 
in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Details : Butterfly operation +*/ +#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = in0 + in3; \ + out1 = in1 + in2; \ + \ + out2 = in1 - in2; \ + out3 = in0 - in3; \ + } + +/* Description : Butterfly of 8 input vectors + Arguments : Inputs - in0 ... in7 + Outputs - out0 .. out7 + Details : Butterfly operation +*/ +#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + out0 = in0 + in7; \ + out1 = in1 + in6; \ + out2 = in2 + in5; \ + out3 = in3 + in4; \ + \ + out4 = in3 - in4; \ + out5 = in2 - in5; \ + out6 = in1 - in6; \ + out7 = in0 - in7; \ + } + +/* Description : Butterfly of 16 input vectors + Arguments : Inputs - in0 ... in15 + Outputs - out0 .. out15 + Details : Butterfly operation +*/ +#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ + in11, in12, in13, in14, in15, out0, out1, out2, out3, \ + out4, out5, out6, out7, out8, out9, out10, out11, out12, \ + out13, out14, out15) \ + { \ + out0 = in0 + in15; \ + out1 = in1 + in14; \ + out2 = in2 + in13; \ + out3 = in3 + in12; \ + out4 = in4 + in11; \ + out5 = in5 + in10; \ + out6 = in6 + in9; \ + out7 = in7 + in8; \ + \ + out8 = in7 - in8; \ + out9 = in6 - in9; \ + out10 = in5 - in10; \ + out11 = in4 - in11; \ + out12 = in3 - in12; \ + out13 = in2 - in13; \ + out14 = in1 - in14; \ + out15 = in0 - in15; \ + } + +/* Description : Transpose input 8x8 byte block + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - as per RTYPE +*/ +#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ + out1, out2, out3, out4, out5, out6, out7) \ + { \ + v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \ + tmp3_m); \ + 
ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ + ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ + ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ + ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ + SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ + SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ + } +#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) + +/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - unsigned byte +*/ +#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \ + in10, in11, in12, in13, in14, in15, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ + ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ + ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ + ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ + \ + tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ + tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ + tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ + tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ + out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ + tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ + out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ + tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ + \ + ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ + out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ + out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out6 = 
(v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ + out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ + out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + } + +/* Description : Transpose 4x4 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - signed halfword +*/ +#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 s0_m, s1_m; \ + \ + ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ + ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ + } + +/* Description : Transpose 4x8 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - signed halfword +*/ +#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ + v8i16 zero_m = { 0 }; \ + \ + ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \ + tmp3_n); \ + ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ + ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ + \ + out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ + out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ 
+ out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ + \ + out4 = zero_m; \ + out5 = zero_m; \ + out6 = zero_m; \ + out7 = zero_m; \ + } + +/* Description : Transpose 8x4 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - signed halfword +*/ +#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ + ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ + ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ + ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ + } + +/* Description : Transpose 8x8 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - as per RTYPE +*/ +#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ + out1, out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 s0_m, s1_m; \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ + ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ + ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ + ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ + PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \ + tmp7_m, out0, out2, out4, out6); \ + out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ + out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ + out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ + out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ + } +#define TRANSPOSE8x8_SH_SH(...)
TRANSPOSE8x8_H(v8i16, __VA_ARGS__) + +/* Description : Transpose 4x4 block with word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - signed word +*/ +#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ + ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ + \ + out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ + out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ + out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ + out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ + } + +/* Description : Add block 4x4 + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Least significant 4 bytes from each input vector are added to + the destination bytes, clipped between 0-255 and stored. + Four rows are read from and written back to 'pdst', 'stride' + bytes apart. + NOTE(review): the inputs are combined as v8i16 vectors, so + "4 bytes" above presumably means the four low halfword + elements of each input - confirm against ILVR_D2_SH. +*/ +#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \ + { \ + uint32_t src0_m, src1_m, src2_m, src3_m; \ + v8i16 inp0_m, inp1_m, res0_m, res1_m; \ + v16i8 dst0_m = { 0 }; \ + v16i8 dst1_m = { 0 }; \ + v16i8 zero_m = { 0 }; \ + \ + ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m); \ + LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ + INSERT_W2_SB(src0_m, src1_m, dst0_m); \ + INSERT_W2_SB(src2_m, src3_m, dst1_m); \ + ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ + ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ + CLIP_SH2_0_255(res0_m, res1_m); \ + PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ + ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ + } + +/* Description : Pack even elements of input vectors & xor with 128 + Arguments : Inputs - in0, in1 + Output - out_m + Return Type - unsigned byte + Details : Signed byte even elements from 'in0' and 'in1' are packed + together in one vector and the resulting vector is xor'ed with + 128 to shift the range from signed to unsigned byte +*/ +#define PCKEV_XORI128_UB(in0, in1) \ + ({ \ + v16u8 out_m; \ + \ + 
out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ + out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ + out_m; \ + }) + +/* Description : Converts inputs to unsigned bytes, interleave, average & store + as 8x4 unsigned byte block + Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, + pdst, stride +*/ +#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \ + pdst, stride) \ + { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + tmp0_m = PCKEV_XORI128_UB(in0, in1); \ + tmp1_m = PCKEV_XORI128_UB(in2, in3); \ + ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ + AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ + } + +/* Description : Pack even byte elements and store byte vector in destination + memory + Arguments : Inputs - in0, in1, pdst +*/ +#define PCKEV_ST_SB(in0, in1, pdst) \ + { \ + v16i8 tmp_m; \ + \ + tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ + ST_SB(tmp_m, (pdst)); \ + } + +/* Description : Horizontal 2 tap filter kernel code + Arguments : Inputs - in0, in1, mask, coeff, shift +*/ +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ + ({ \ + v16i8 tmp0_m; \ + v8u16 tmp1_m; \ + \ + tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ + tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ + tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ + \ + tmp1_m; \ + }) +#endif /* AOM_DSP_MIPS_MACROS_MSA_H_ */ diff --git a/third_party/aom/aom_dsp/mips/sad_msa.c b/third_party/aom/aom_dsp/mips/sad_msa.c new file mode 100644 index 000000000..258eb5c07 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/sad_msa.c @@ -0,0 +1,1529 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/macros_msa.h" + +#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \ + { \ + out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \ + } +#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__) + +static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + + diff = __msa_asub_u_b(src, ref); + sad += __msa_hadd_u_h(diff, diff); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, 
ref1); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + LD_UB2(ref, ref_stride, ref0, ref1); + ref += (2 * ref_stride); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + LD_UB2(ref, ref_stride, ref0, ref1); + ref += (2 * ref_stride); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t sad = 0; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + + for (ht_cnt = (height >> 1); ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + 
src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad = HADD_UH_U32(sad0); + sad += HADD_UH_U32(sad1); + + return sad; +} + +static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 ref0, ref1, ref2, ref3, diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + INSERT_W4_UB(src0, src1, src2, src3, src); + + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad0 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); +} + +static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src0, src1, src2, 
src3; + v16u8 ref0, ref1, ref00, ref11, ref22, ref33; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); + ref += (4 * ref_stride); + PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, + ref0, ref1); + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); +} + +static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src, ref, ref0, ref1, diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for (ht_cnt = (height >> 1); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + + 
ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); +} + +static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for (ht_cnt = height >> 1; ht_cnt--;) { + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); + ref += ref_stride; + + sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); + ref += ref_stride; + + sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); +} + +static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3; + v8u16 sad0_0 = { 0 }; + v8u16 sad0_1 = { 0 }; + v8u16 sad1_0 = { 0 }; + v8u16 sad1_1 = { 0 }; + v8u16 
sad2_0 = { 0 }; + v8u16 sad2_1 = { 0 }; + v4u32 sad; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3); + ref0_4 = LD_UB(ref + 64); + ref += ref_stride; + + sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); + sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); + sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); + sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad = __msa_hadd_u_w(sad0_0, sad0_0); + sad += __msa_hadd_u_w(sad0_1, sad0_1); + sad_array[0] = HADD_SW_S32((v4i32)sad); + + sad = __msa_hadd_u_w(sad1_0, sad1_0); + sad += __msa_hadd_u_w(sad1_1, sad1_1); + sad_array[1] = HADD_SW_S32((v4i32)sad); + + sad = __msa_hadd_u_w(sad2_0, sad2_0); + sad += __msa_hadd_u_w(sad2_1, sad2_1); + sad_array[2] = HADD_SW_S32((v4i32)sad); +} + +static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3, diff; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + v8u16 sad4 = { 0 }; + v8u16 sad5 = { 0 }; + v8u16 sad6 = { 0 }; + v8u16 sad7 = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + INSERT_W4_UB(src0, src1, src2, src3, src); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, 
ref); + sad0 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad3 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad4 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad5 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad6 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad7 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); + sad_array[4] = HADD_UH_U32(sad4); + sad_array[5] = HADD_UH_U32(sad5); + sad_array[6] = HADD_UH_U32(sad6); + sad_array[7] = HADD_UH_U32(sad7); +} + +static void sad_8width_x8_msa(const uint8_t *src, 
int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref00, ref11, ref22, ref33; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + v8u16 sad4 = { 0 }; + v8u16 sad5 = { 0 }; + v8u16 sad6 = { 0 }; + v8u16 sad7 = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); + ref += (4 * ref_stride); + PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, + ref0, ref1); + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad5 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, 
ref1); + sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); + sad_array[4] = HADD_UH_U32(sad4); + sad_array[5] = HADD_UH_U32(sad5); + sad_array[6] = HADD_UH_U32(sad6); + sad_array[7] = HADD_UH_U32(sad7); +} + +static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src, ref0, ref1, ref; + v16u8 diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + v8u16 sad4 = { 0 }; + v8u16 sad5 = { 0 }; + v8u16 sad6 = { 0 }; + v8u16 sad7 = { 0 }; + + for (ht_cnt = (height >> 1); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3); + diff = __msa_asub_u_b(src, ref); + sad3 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4); + diff = __msa_asub_u_b(src, ref); + sad4 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5); + diff = __msa_asub_u_b(src, ref); + sad5 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6); + diff = __msa_asub_u_b(src, ref); + sad6 += __msa_hadd_u_h(diff, 
diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7); + diff = __msa_asub_u_b(src, ref); + sad7 += __msa_hadd_u_h(diff, diff); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3); + diff = __msa_asub_u_b(src, ref); + sad3 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4); + diff = __msa_asub_u_b(src, ref); + sad4 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5); + diff = __msa_asub_u_b(src, ref); + sad5 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6); + diff = __msa_asub_u_b(src, ref); + sad6 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7); + diff = __msa_asub_u_b(src, ref); + sad7 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); + sad_array[4] = HADD_UH_U32(sad4); + sad_array[5] = HADD_UH_U32(sad5); + sad_array[6] = HADD_UH_U32(sad6); + sad_array[7] = HADD_UH_U32(sad7); +} + +static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src0, src1; + v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + v8u16 sad4 = { 0 }; + v8u16 sad5 = { 0 }; + v8u16 sad6 = { 0 }; + v8u16 sad7 = { 0 }; + + for (ht_cnt = height; 
ht_cnt--;) { + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); + ref += ref_stride; + + sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); + sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); + sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); + sad5 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); + sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); + sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); + sad_array[4] = HADD_UH_U32(sad4); + sad_array[5] = HADD_UH_U32(sad5); + sad_array[6] = HADD_UH_U32(sad6); + sad_array[7] = HADD_UH_U32(sad7); +} + +/* Computes eight SADs for a 64-wide block (four v16u8 loads per source row) + * and stores them in sad_array[0..7]. + * The rows are walked twice. The first pass fills sad_array[0..3] from the + * reference as loaded and slid by 1, 2 and 3. The second pass restarts from + * the saved src_dup/ref_dup pointers and fills sad_array[4..7] with slide + * amounts 4..7. Both passes cover the same 'height' rows. + * NOTE(review): the slide amounts are presumably byte offsets of the + * candidate reference position - confirm against SLDI_B2_UB. + */ +static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + const uint8_t *src_dup, *ref_dup; + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4; + v16u8 ref0, ref1, ref2, ref3; + v8u16 sad0_0 = { 0 }; + v8u16 sad0_1 = { 0 }; + v8u16 sad1_0 = { 0 }; + v8u16 sad1_1 = { 0 }; + v8u16 sad2_0 = { 0 }; + v8u16 sad2_1 = { 0 }; + v8u16 sad3_0 = { 0 }; + v8u16 sad3_1 = { 0 }; + v4u32 sad; + + src_dup = src; + ref_dup = ref; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); + ref += ref_stride; + + sad0_0 
+= SAD_UB2_UH(src0, src1, ref0_0, ref0_1); + sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); + sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); + sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3); + sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad = __msa_hadd_u_w(sad0_0, sad0_0); + sad += __msa_hadd_u_w(sad0_1, sad0_1); + sad_array[0] = HADD_SW_S32(sad); + + sad = __msa_hadd_u_w(sad1_0, sad1_0); + sad += __msa_hadd_u_w(sad1_1, sad1_1); + sad_array[1] = HADD_SW_S32(sad); + + sad = __msa_hadd_u_w(sad2_0, sad2_0); + sad += __msa_hadd_u_w(sad2_1, sad2_1); + sad_array[2] = HADD_SW_S32(sad); + + sad = __msa_hadd_u_w(sad3_0, sad3_0); + sad += __msa_hadd_u_w(sad3_1, sad3_1); + sad_array[3] = HADD_SW_S32(sad); + + /* The accumulators are reused by the second pass, so clear them. */ + sad0_0 = (v8u16)__msa_ldi_h(0); + sad0_1 = (v8u16)__msa_ldi_h(0); + sad1_0 = (v8u16)__msa_ldi_h(0); + sad1_1 = (v8u16)__msa_ldi_h(0); + sad2_0 = (v8u16)__msa_ldi_h(0); + sad2_1 = (v8u16)__msa_ldi_h(0); + sad3_0 = (v8u16)__msa_ldi_h(0); + sad3_1 = (v8u16)__msa_ldi_h(0); + + /* Second pass walks the same 'height' rows as the first pass, so that + * sad_array[4..7] cover exactly the rows used for sad_array[0..3]. */ + for (ht_cnt = height; ht_cnt--;) { + LD_UB4(src_dup, 16, src0, src1, src2, src3); + src_dup += src_stride; + LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); + ref_dup += ref_stride; + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4); + sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); + 
SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5); + sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6); + sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7); + sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad = __msa_hadd_u_w(sad0_0, sad0_0); + sad += __msa_hadd_u_w(sad0_1, sad0_1); + sad_array[4] = HADD_SW_S32(sad); + + sad = __msa_hadd_u_w(sad1_0, sad1_0); + sad += __msa_hadd_u_w(sad1_1, sad1_1); + sad_array[5] = HADD_SW_S32(sad); + + sad = __msa_hadd_u_w(sad2_0, sad2_0); + sad += __msa_hadd_u_w(sad2_1, sad2_1); + sad_array[6] = HADD_SW_S32(sad); + + sad = __msa_hadd_u_w(sad3_0, sad3_0); + sad += __msa_hadd_u_w(sad3_1, sad3_1); + sad_array[7] = HADD_SW_S32(sad); +} + +static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + INSERT_W4_UB(src0, src1, src2, src3, src); + src_ptr += (4 * src_stride); + + LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref0_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad0 
+= __msa_hadd_u_h(diff, diff); + + LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref1_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref2_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + + LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref3_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad3 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt; + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref0_ptr += (4 * ref_stride); + LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7); + ref1_ptr += (4 * ref_stride); + LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11); + ref2_ptr += (4 * ref_stride); + LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15); + ref3_ptr += (4 * ref_stride); + + PCKEV_D2_UB(src1, src0, src3, src2, src0, src1); + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + 
sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + + PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1); + sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt; + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + v16u8 src, ref0, ref1, ref2, ref3, diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = (height >> 1); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref0 = LD_UB(ref0_ptr); + ref0_ptr += ref_stride; + ref1 = LD_UB(ref1_ptr); + ref1_ptr += ref_stride; + ref2 = LD_UB(ref2_ptr); + ref2_ptr += ref_stride; + ref3 = LD_UB(ref3_ptr); + ref3_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref1); + sad1 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref2); + sad2 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref3); + sad3 += __msa_hadd_u_h(diff, diff); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref0 = LD_UB(ref0_ptr); + ref0_ptr += ref_stride; + ref1 = LD_UB(ref1_ptr); + ref1_ptr += ref_stride; + ref2 = LD_UB(ref2_ptr); + ref2_ptr += ref_stride; + ref3 = LD_UB(ref3_ptr); + ref3_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref1); + sad1 += __msa_hadd_u_h(diff, 
diff); + diff = __msa_asub_u_b(src, ref2); + sad2 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref3); + sad3 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB2(src, 16, src0, src1); + src += src_stride; + + LD_UB2(ref0_ptr, 16, ref0, ref1); + ref0_ptr += ref_stride; + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(ref1_ptr, 16, ref0, ref1); + ref1_ptr += ref_stride; + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(ref2_ptr, 16, ref0, ref1); + ref2_ptr += ref_stride; + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(ref3_ptr, 16, ref0, ref1); + ref3_ptr += ref_stride; + sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8u16 sad0_0 = { 0 }; + v8u16 sad0_1 = { 0 }; + v8u16 sad1_0 = { 0 }; + v8u16 sad1_1 = { 0 }; + v8u16 sad2_0 = { 0 }; + v8u16 sad2_1 = { 0 }; + v8u16 sad3_0 = { 0 }; + v8u16 sad3_1 = { 0 }; + + ref0_ptr = 
aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + + LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3); + ref0_ptr += ref_stride; + sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3); + ref1_ptr += ref_stride; + sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3); + ref2_ptr += ref_stride; + sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3); + ref3_ptr += ref_stride; + sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad_array[0] = HADD_UH_U32(sad0_0); + sad_array[0] += HADD_UH_U32(sad0_1); + sad_array[1] = HADD_UH_U32(sad1_0); + sad_array[1] += HADD_UH_U32(sad1_1); + sad_array[2] = HADD_UH_U32(sad2_0); + sad_array[2] += HADD_UH_U32(sad2_1); + sad_array[3] = HADD_UH_U32(sad3_0); + sad_array[3] += HADD_UH_U32(sad3_1); +} + +static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff, pred, comp; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + + comp = __msa_aver_u_b(pred, ref); + diff = __msa_asub_u_b(src, comp); + sad += __msa_hadd_u_h(diff, diff); + } 
+ + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 diff0, diff1, pred0, pred1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1); + sad += SAD_UB2_UH(src0, src1, diff0, diff1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3, comp0, comp1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 3); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += (4 * 16); + AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += (4 * 16); + AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); + 
sad += SAD_UB2_UH(src2, src3, comp0, comp1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 comp0, comp1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 16, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6); + LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7); + ref += (4 * ref_stride); + + LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6); + LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7); + sec_pred += (4 * 32); + + AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1); + sad += SAD_UB2_UH(src4, src5, comp0, comp1); + AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1); + sad += SAD_UB2_UH(src6, src7, comp0, comp1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 comp0, comp1, comp2, comp3; + v16u8 pred0, pred1, pred2, pred3; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v4u32 sad; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; 
+ AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + } + + sad = __msa_hadd_u_w(sad0, sad0); + sad += __msa_hadd_u_w(sad1, sad1); + + return HADD_SW_S32(sad); +} + +#define AOM_SAD_4xHEIGHT_MSA(height) \ + uint32_t aom_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_4width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_8xHEIGHT_MSA(height) \ + uint32_t aom_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_8width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_16xHEIGHT_MSA(height) 
\ + uint32_t aom_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_16width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_32xHEIGHT_MSA(height) \ + uint32_t aom_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_32width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_64xHEIGHT_MSA(height) \ + uint32_t aom_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_64width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_4xHEIGHTx3_MSA(height) \ + void aom_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_8xHEIGHTx3_MSA(height) \ + void aom_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_16xHEIGHTx3_MSA(height) \ + void aom_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_32xHEIGHTx3_MSA(height) \ + void aom_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_64xHEIGHTx3_MSA(height) \ + void aom_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + 
+/*
+ * Generators for the exported x8, x4d and averaging SAD entry points.
+ *
+ * Each AOM_*_MSA(height) macro expands to exactly one function definition
+ * whose body is a single call to the worker of the same width, with `height`
+ * forwarded as the row count. The shared boilerplate lives in the three
+ * *_IMPL helpers; the width-specific macro names are kept so that every
+ * existing invocation keeps working. `height` is pasted into the generated
+ * function name, so it must be a plain integer literal.
+ */
+
+/*
+ * Defines
+ *   void aom_sad<width>x<height>x8_msa(src, src_stride, ref, ref_stride, sads)
+ * which forwards to sad_<width>width_x8_msa(). The worker is not visible
+ * from this part of the file; presumably it fills eight entries of `sads`
+ * (TODO confirm against its definition).
+ */
+#define AOM_SAD_MSA_X8_IMPL(width, height) \
+  void aom_sad##width##x##height##x8_msa( \
+      const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+      int32_t ref_stride, uint32_t *sads) { \
+    sad_##width##width_x8_msa(src, src_stride, ref, ref_stride, height, \
+                              sads); \
+  }
+
+/*
+ * Defines
+ *   void aom_sad<width>x<height>x4d_msa(src, src_stride, refs, ref_stride,
+ *                                       sads)
+ * which forwards to sad_<width>width_x4d_msa(). That worker reads the four
+ * reference pointers refs[0..3] and stores one SAD per reference in
+ * sads[0..3].
+ */
+#define AOM_SAD_MSA_X4D_IMPL(width, height) \
+  void aom_sad##width##x##height##x4d_msa( \
+      const uint8_t *src, int32_t src_stride, const uint8_t *const refs[], \
+      int32_t ref_stride, uint32_t *sads) { \
+    sad_##width##width_x4d_msa(src, src_stride, refs, ref_stride, height, \
+                               sads); \
+  }
+
+/*
+ * Defines
+ *   uint32_t aom_sad<width>x<height>_avg_msa(src, src_stride, ref,
+ *                                            ref_stride, second_pred)
+ * which returns avgsad_<width>width_msa(): the SAD between `src` and the
+ * rounded byte-wise average of `ref` and `second_pred`.
+ */
+#define AOM_SAD_MSA_AVG_IMPL(width, height) \
+  uint32_t aom_sad##width##x##height##_avg_msa( \
+      const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+      int32_t ref_stride, const uint8_t *second_pred) { \
+    return avgsad_##width##width_msa(src, src_stride, ref, ref_stride, \
+                                     height, second_pred); \
+  }
+
+/* x8 variants, one generator per block width. */
+#define AOM_SAD_4xHEIGHTx8_MSA(height) AOM_SAD_MSA_X8_IMPL(4, height)
+#define AOM_SAD_8xHEIGHTx8_MSA(height) AOM_SAD_MSA_X8_IMPL(8, height)
+#define AOM_SAD_16xHEIGHTx8_MSA(height) AOM_SAD_MSA_X8_IMPL(16, height)
+#define AOM_SAD_32xHEIGHTx8_MSA(height) AOM_SAD_MSA_X8_IMPL(32, height)
+#define AOM_SAD_64xHEIGHTx8_MSA(height) AOM_SAD_MSA_X8_IMPL(64, height)
+
+/* x4d variants, one generator per block width. */
+#define AOM_SAD_4xHEIGHTx4D_MSA(height) AOM_SAD_MSA_X4D_IMPL(4, height)
+#define AOM_SAD_8xHEIGHTx4D_MSA(height) AOM_SAD_MSA_X4D_IMPL(8, height)
+#define AOM_SAD_16xHEIGHTx4D_MSA(height) AOM_SAD_MSA_X4D_IMPL(16, height)
+#define AOM_SAD_32xHEIGHTx4D_MSA(height) AOM_SAD_MSA_X4D_IMPL(32, height)
+#define AOM_SAD_64xHEIGHTx4D_MSA(height) AOM_SAD_MSA_X4D_IMPL(64, height)
+
+/* Averaging (second-prediction) variants, one generator per block width. */
+#define AOM_AVGSAD_4xHEIGHT_MSA(height) AOM_SAD_MSA_AVG_IMPL(4, height)
+#define AOM_AVGSAD_8xHEIGHT_MSA(height) AOM_SAD_MSA_AVG_IMPL(8, height)
+#define AOM_AVGSAD_16xHEIGHT_MSA(height) AOM_SAD_MSA_AVG_IMPL(16, height)
+#define AOM_AVGSAD_32xHEIGHT_MSA(height) AOM_SAD_MSA_AVG_IMPL(32, height)
+#define AOM_AVGSAD_64xHEIGHT_MSA(height) AOM_SAD_MSA_AVG_IMPL(64, height)
+
+/* Instantiate the exported functions, grouped by block size (width x height). */
+/* clang-format off */
+// 64x64
+AOM_SAD_64xHEIGHT_MSA(64)
+AOM_SAD_64xHEIGHTx3_MSA(64)
+AOM_SAD_64xHEIGHTx8_MSA(64)
+AOM_SAD_64xHEIGHTx4D_MSA(64)
+AOM_AVGSAD_64xHEIGHT_MSA(64)
+
+// 64x32
+AOM_SAD_64xHEIGHT_MSA(32)
+AOM_SAD_64xHEIGHTx3_MSA(32)
+AOM_SAD_64xHEIGHTx8_MSA(32)
+AOM_SAD_64xHEIGHTx4D_MSA(32)
+AOM_AVGSAD_64xHEIGHT_MSA(32)
+
+// 32x64
+AOM_SAD_32xHEIGHT_MSA(64)
+AOM_SAD_32xHEIGHTx3_MSA(64)
+AOM_SAD_32xHEIGHTx8_MSA(64)
+AOM_SAD_32xHEIGHTx4D_MSA(64)
+AOM_AVGSAD_32xHEIGHT_MSA(64)
+
+// 32x32
+AOM_SAD_32xHEIGHT_MSA(32)
+AOM_SAD_32xHEIGHTx3_MSA(32)
+AOM_SAD_32xHEIGHTx8_MSA(32)
+AOM_SAD_32xHEIGHTx4D_MSA(32)
+AOM_AVGSAD_32xHEIGHT_MSA(32)
+
+// 32x16
+AOM_SAD_32xHEIGHT_MSA(16)
+AOM_SAD_32xHEIGHTx3_MSA(16)
+AOM_SAD_32xHEIGHTx8_MSA(16)
+AOM_SAD_32xHEIGHTx4D_MSA(16)
+AOM_AVGSAD_32xHEIGHT_MSA(16)
+
+// 16x32
+AOM_SAD_16xHEIGHT_MSA(32)
+AOM_SAD_16xHEIGHTx3_MSA(32)
+AOM_SAD_16xHEIGHTx8_MSA(32)
+AOM_SAD_16xHEIGHTx4D_MSA(32)
+AOM_AVGSAD_16xHEIGHT_MSA(32)
+
+// 16x16
+AOM_SAD_16xHEIGHT_MSA(16)
+AOM_SAD_16xHEIGHTx3_MSA(16)
+AOM_SAD_16xHEIGHTx8_MSA(16)
+AOM_SAD_16xHEIGHTx4D_MSA(16)
+AOM_AVGSAD_16xHEIGHT_MSA(16)
+
+// 16x8
+AOM_SAD_16xHEIGHT_MSA(8)
+AOM_SAD_16xHEIGHTx3_MSA(8)
+AOM_SAD_16xHEIGHTx8_MSA(8)
+AOM_SAD_16xHEIGHTx4D_MSA(8)
+AOM_AVGSAD_16xHEIGHT_MSA(8)
+
+// 8x16
+AOM_SAD_8xHEIGHT_MSA(16)
+AOM_SAD_8xHEIGHTx3_MSA(16)
+AOM_SAD_8xHEIGHTx8_MSA(16)
+AOM_SAD_8xHEIGHTx4D_MSA(16)
+AOM_AVGSAD_8xHEIGHT_MSA(16)
+
+// 8x8
+AOM_SAD_8xHEIGHT_MSA(8)
+AOM_SAD_8xHEIGHTx3_MSA(8)
+AOM_SAD_8xHEIGHTx8_MSA(8)
+AOM_SAD_8xHEIGHTx4D_MSA(8)
+AOM_AVGSAD_8xHEIGHT_MSA(8)
+
+// 8x4
+AOM_SAD_8xHEIGHT_MSA(4)
+AOM_SAD_8xHEIGHTx3_MSA(4)
+AOM_SAD_8xHEIGHTx8_MSA(4)
+AOM_SAD_8xHEIGHTx4D_MSA(4)
+AOM_AVGSAD_8xHEIGHT_MSA(4)
+
+// 4x8
+AOM_SAD_4xHEIGHT_MSA(8)
+AOM_SAD_4xHEIGHTx3_MSA(8)
+AOM_SAD_4xHEIGHTx8_MSA(8)
+AOM_SAD_4xHEIGHTx4D_MSA(8) +AOM_AVGSAD_4xHEIGHT_MSA(8) + +// 4x4 +AOM_SAD_4xHEIGHT_MSA(4) +AOM_SAD_4xHEIGHTx3_MSA(4) +AOM_SAD_4xHEIGHTx8_MSA(4) +AOM_SAD_4xHEIGHTx4D_MSA(4) +AOM_AVGSAD_4xHEIGHT_MSA(4) + /* clang-format on */ diff --git a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c new file mode 100644 index 000000000..3eb85107d --- /dev/null +++ b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c @@ -0,0 +1,1795 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "./aom_dsp_rtcd.h" +#include "aom_ports/mem.h" +#include "aom_dsp/mips/macros_msa.h" +#include "aom_dsp/variance.h" + +static const uint8_t bilinear_filters_msa[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + \ + sub += res_l0_m + res_l1_m; \ + } + +#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + sse - (((int64_t)diff * diff) >> shift) + +static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t height, + int32_t *diff) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 pred, src = { 0 }; + v16u8 ref = { 0 }; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t height, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, 
pred1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src, ref, pred; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t 
avg_sse_diff_32width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1, pred0, pred1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1, pred0, pred1; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + 
for (ht_cnt = 16; ht_cnt--;) { + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, 
ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v8i16 avg2 = { 0 }; + v8i16 avg3 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 32; ht_cnt--;) { + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, 
ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + vec += __msa_hadd_s_w(avg2, avg2); + vec += __msa_hadd_s_w(avg3, avg3); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_4width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 filt0, ref = { 0 }; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); + src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); + CALC_MSE_AVG_B(src0, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_8width_h_msa( + const uint8_t *src, int32_t src_stride, 
const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 filt0, out, ref0, ref1, ref2, ref3; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); + CALC_MSE_AVG_B(out, ref0, var, avg); + out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); + CALC_MSE_AVG_B(out, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_16width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 dst0, dst1, dst2, dst3, filt0; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); 
loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, dst0, var, avg); + CALC_MSE_AVG_B(src1, dst1, var, avg); + CALC_MSE_AVG_B(src2, dst2, var, avg); + CALC_MSE_AVG_B(src3, dst3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_32width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_64width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, + filter, height, 
&diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_4width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4, out; + v16u8 src10_r, src32_r, src21_r, src43_r; + v16u8 ref = { 0 }; + v16u8 src2110, src4332; + v16u8 filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + v8u16 tmp0, tmp1; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_8width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + v16u8 ref0, ref1, ref2, ref3; + v8u16 vec0, vec1, vec2, vec3; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = 
(height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_16width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out0, out1, out2, out3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out1 = 
(v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + src0 = src4; + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_32width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_64width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_4width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out, ref = { 0 }; + 
v16u8 filt_vt, filt_hz, vec0, vec1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4; + v8u16 tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_8width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out0, out1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt_vt, filt_hz, vec0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = 
(v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_16width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 ref0, ref1, ref2, ref3; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3; + 
v8u16 tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + LD_UB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, 
filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + CALC_MSE_AVG_B(src2, ref2, var, avg); + CALC_MSE_AVG_B(src3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_32width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_64width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_4width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 out, pred, filt0, ref = { 0 }; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + 
v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); + out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_8width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 out, pred, filt0; + v16u8 ref0, ref1, ref2, ref3; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, 
vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); + + pred = LD_UB(sec_pred); + sec_pred += 16; + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref0, var, avg); + out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); + pred = LD_UB(sec_pred); + sec_pred += 16; + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t subpel_avg_ssediff_16w_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + int16_t filtval; + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 dst0, dst1, dst2, dst3; + v16u8 tmp0, tmp1, tmp2, tmp3; + v16u8 pred0, pred1, pred2, pred3, filt0; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + dst += (4 * dst_stride); + LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); + sec_pred += (4 * width); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, 
vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1, + tmp2, tmp3); + AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1, + tmp2, tmp3); + + CALC_MSE_AVG_B(tmp0, dst0, var, avg); + CALC_MSE_AVG_B(tmp1, dst1, var, avg); + CALC_MSE_AVG_B(tmp2, dst2, var, avg); + CALC_MSE_AVG_B(tmp3, dst3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_16width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, + sec_pred, filter, height, diff, 16); +} + +static uint32_t sub_pixel_avg_sse_diff_32width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 32); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, 
dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_4width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 src10_r, src32_r, src21_r, src43_r; + v16u8 out, pred, ref = { 0 }; + v16u8 src2110, src4332, filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + v8u16 tmp0, tmp1; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_8width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, filt0; + v8u16 vec0, vec1, 
vec2, vec3; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t subpel_avg_ssediff_16w_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out0, out1, out2, out3, filt0; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); + sec_pred += (4 * width); + + ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UH(src1, src0, 
src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + src0 = src4; + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, + out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_16width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, + sec_pred, filter, height, diff, 16); +} + +static uint32_t sub_pixel_avg_sse_diff_32width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 32); + src += 16; + dst += 16; + 
sec_pred += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 out, pred, ref = { 0 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = 
(v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 pred0, pred1, out0, out1; + v16u8 filt_hz, filt_vt, vec0; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, 
(v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); + AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t subpel_avg_ssediff_16w_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v16u8 out0, out1, out2, out3; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + LD_UB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(sec_pred, 
width, pred0, pred1, pred2, pred3); + sec_pred += (4 * width); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, + out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t 
sub_pixel_avg_sse_diff_16width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, diff, 16); +} + +static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, &diff0[loop_cnt], 32); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); +#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); +#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define 
VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8); + +#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); +#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); + +#define AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t aom_sub_pixel_variance##wd##x##ht##_msa( \ + const uint8_t *src, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sse) { \ + int32_t diff; \ + uint32_t var; \ + const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \ + src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_sse_diff_##wd##width_v_msa( \ + src, src_stride, ref, ref_stride, v_filter, ht, &diff); \ + } \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + if (xoffset) { \ + *sse = sub_pixel_sse_diff_##wd##width_h_msa( \ + src, src_stride, ref, ref_stride, h_filter, ht, &diff); \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + var = aom_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \ + sse); \ + } \ + } \ + \ + return var; \ + } + +/* clang-format off */ +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16) 
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64) +/* clang-format on */ + +#define AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t aom_sub_pixel_avg_variance##wd##x##ht##_msa( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ + } else { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, ht, &diff); \ + } \ + } \ + \ + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } + +/* clang-format off */ +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8) + +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16) + +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32) + +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32) +/* clang-format on */ + +uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, + int32_t src_stride, + int32_t xoffset, int32_t yoffset, + const uint8_t *ref_ptr, + int32_t 
ref_stride, uint32_t *sse, + const uint8_t *sec_pred) { + int32_t diff; + const uint8_t *h_filter = bilinear_filters_msa[xoffset]; + const uint8_t *v_filter = bilinear_filters_msa[yoffset]; + + if (yoffset) { + if (xoffset) { + *sse = sub_pixel_avg_sse_diff_32width_hv_msa( + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, + v_filter, 64, &diff); + } else { + *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr, + ref_stride, sec_pred, + v_filter, 64, &diff); + } + } else { + if (xoffset) { + *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr, + ref_stride, sec_pred, + h_filter, 64, &diff); + } else { + *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride, + sec_pred, &diff); + } + } + + return VARIANCE_32Wx64H(*sse, diff); +} + +#define AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \ + uint32_t aom_sub_pixel_avg_variance64x##ht##_msa( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_64width_v_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ + } else { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_64width_h_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, &diff); \ + } \ + } \ + \ + return VARIANCE_64Wx##ht##H(*sse, diff); \ + } + +/* clang-format off */ +AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32) 
+AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64) +/* clang-format on */ diff --git a/third_party/aom/aom_dsp/mips/subtract_msa.c b/third_party/aom/aom_dsp/mips/subtract_msa.c new file mode 100644 index 000000000..37b89765d --- /dev/null +++ b/third_party/aom/aom_dsp/mips/subtract_msa.c @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/macros_msa.h" + +static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + uint32_t src0, src1, src2, src3; + uint32_t pred0, pred1, pred2, pred3; + v16i8 src = { 0 }; + v16i8 pred = { 0 }; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + LW4(src_ptr, src_stride, src0, src1, src2, src3); + LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3); + INSERT_W4_SB(src0, src1, src2, src3, src); + INSERT_W4_SB(pred0, pred1, pred2, pred3, pred); + ILVRL_B2_UB(src, pred, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride)); +} + +static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + uint32_t loop_cnt; + uint64_t src0, src1, pred0, pred1; + v16i8 src = { 0 }; + v16i8 pred = { 0 }; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 4; loop_cnt--;) { + LD2(src_ptr, src_stride, src0, src1); + src_ptr += (2 
* src_stride); + LD2(pred_ptr, pred_stride, pred0, pred1); + pred_ptr += (2 * pred_stride); + + INSERT_D2_SB(src0, src1, src); + INSERT_D2_SB(pred0, pred1, pred); + ILVRL_B2_UB(src, pred, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff_ptr, diff_stride); + diff_ptr += (2 * diff_stride); + } +} + +static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + int8_t count; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (count = 2; count--;) { + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6, + pred7); + pred += (8 * pred_stride); + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src7, pred7, src_l0, 
src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + } +} + +static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 8; loop_cnt--;) { + LD_SB2(src, 16, src0, src1); + src += src_stride; + LD_SB2(src, 16, src2, src3); + src += src_stride; + LD_SB2(src, 16, src4, src5); + src += src_stride; + LD_SB2(src, 16, src6, src7); + src += src_stride; + + LD_SB2(pred, 16, pred0, pred1); + pred += pred_stride; + LD_SB2(pred, 16, pred2, pred3); + pred += pred_stride; + LD_SB2(pred, 16, pred4, pred5); + pred += pred_stride; + LD_SB2(pred, 16, pred6, pred7); + pred += pred_stride; + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff 
+ 16, 8); + diff += diff_stride; + } +} + +static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 32; loop_cnt--;) { + LD_SB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_SB4(src, 16, src4, src5, src6, src7); + src += src_stride; + + LD_SB4(pred, 16, pred0, pred1, pred2, pred3); + pred += pred_stride; + LD_SB4(pred, 16, pred4, pred5, pred6, pred7); + pred += pred_stride; + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 32, 8); + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 48, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 32, 8); + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 48, 8); + diff += diff_stride; + } +} + +void aom_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + if (rows == cols) { + switch (rows) 
{ + case 4: + sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 8: + sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 16: + sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 32: + sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 64: + sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + default: + aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } + } else { + aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + } +} diff --git a/third_party/aom/aom_dsp/mips/txfm_macros_msa.h b/third_party/aom/aom_dsp/mips/txfm_macros_msa.h new file mode 100644 index 000000000..cba5d4445 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/txfm_macros_msa.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ +#define AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ + +#include "aom_dsp/mips/macros_msa.h" + +#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ + { \ + v8i16 k0_m = __msa_fill_h(cnst0); \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + s0_m = (v4i32)__msa_fill_h(cnst1); \ + k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \ + \ + ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \ + ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \ + DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \ + SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ + out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ + \ + DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \ + SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ + out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ + } + +#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, dst0, \ + dst1, dst2, dst3) \ + { \ + v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \ + v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \ + \ + DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, tp0_m, tp2_m, tp3_m, \ + tp4_m); \ + DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, tp5_m, tp6_m, tp7_m, \ + tp8_m); \ + BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \ + BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \ + SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \ + SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, dst0, \ + dst1, dst2, dst3); \ + } + +#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) \ + ({ \ + v8i16 dst_m; \ + v4i32 tp0_m, tp1_m; \ + \ + DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \ + SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \ + dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \ + \ + dst_m; \ + }) + +#define MADD_SHORT(m0, m1, c0, c1, res0, res1) \ + { \ + v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \ + v8i16 madd_s0_m, madd_s1_m; \ + \ + ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \ + 
DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, c0, c0, c1, c1, \ + madd0_m, madd1_m, madd2_m, madd3_m); \ + SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \ + } + +#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \ + out2, out3) \ + { \ + v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \ + \ + ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \ + ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst0, cst0, cst2, \ + cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \ + SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst1, cst1, cst3, \ + cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \ + SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \ + } +#endif // AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/variance_msa.c b/third_party/aom/aom_dsp/mips/variance_msa.c new file mode 100644 index 000000000..745fdfc9c --- /dev/null +++ b/third_party/aom/aom_dsp/mips/variance_msa.c @@ -0,0 +1,632 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/macros_msa.h" + +#define CALC_MSE_B(src, ref, var) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + } + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + \ + sub += res_l0_m + res_l1_m; \ + } + +#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + sse - (((int64_t)diff * diff) >> shift) + +static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + int32_t ht_cnt; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t 
ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src, ref; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + 
CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + *diff = 
HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v8i16 avg2 = { 0 }; + v8i16 avg3 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 32; ht_cnt--;) { + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, 
ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + vec += __msa_hadd_s_w(avg2, avg2); + vec += __msa_hadd_s_w(avg3, avg3); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t get_mb_ss_msa(const int16_t *src) { + uint32_t sum, cnt; + v8i16 src0, src1, src2, src3; + v4i32 src0_l, src1_l, src2_l, src3_l; + v4i32 src0_r, src1_r, src2_r, src3_r; + v2i64 sq_src_l = { 0 }; + v2i64 sq_src_r = { 0 }; + + for (cnt = 8; cnt--;) { + LD_SH4(src, 8, src0, src1, src2, src3); + src += 4 * 8; + + UNPCK_SH_SW(src0, src0_l, src0_r); + UNPCK_SH_SW(src1, src1_l, src1_r); + UNPCK_SH_SW(src2, src2_l, src2_r); + UNPCK_SH_SW(src3, src3_l, src3_r); + + DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r); + DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r); + DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r); + DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r); + } + + sq_src_l += __msa_splati_d(sq_src_l, 1); + sq_src_r += __msa_splati_d(sq_src_r, 1); + + sum = __msa_copy_s_d(sq_src_l, 0); + sum += __msa_copy_s_d(sq_src_r, 0); + + return sum; +} + +static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + CALC_MSE_B(src, ref, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, 
int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src, ref; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + + LD_UB2(src_ptr, 16, 
src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v4i32 var = { 0 }; + + for (ht_cnt = height >> 1; ht_cnt--;) { + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src2, ref2, var); + CALC_MSE_B(src1, ref1, var); + CALC_MSE_B(src3, ref3, var); + + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src2, ref2, var); + CALC_MSE_B(src1, ref1, var); + CALC_MSE_B(src3, ref3, var); + } + + return HADD_SW_S32(var); +} + +uint32_t aom_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride) { + uint32_t err = 0; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16i8 src = { 0 }; + v16i8 ref = { 0 }; + v16u8 src_vec0, src_vec1; + v8i16 diff0, diff1; + v4i32 err0 = { 0 }; + v4i32 err1 = { 0 }; + + LW4(src_ptr, src_stride, src0, src1, src2, src3); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_SB(src0, src1, src2, src3, src); + INSERT_W4_SB(ref0, ref1, ref2, ref3, ref); + ILVRL_B2_UB(src, ref, src_vec0, src_vec1); + HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1); + DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1); + err = HADD_SW_S32(err0); + err += HADD_SW_S32(err1); + 
+ return err; +} + +#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); +#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); +#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8); + +#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); +#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); + +#define AOM_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t aom_variance##wd##x##ht##_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, uint32_t *sse) { \ + int32_t diff; \ + \ + *sse = \ + sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \ + \ + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } + +/* clang-format off */ +AOM_VARIANCE_WDXHT_MSA(4, 4) +AOM_VARIANCE_WDXHT_MSA(4, 8) + +AOM_VARIANCE_WDXHT_MSA(8, 4) +AOM_VARIANCE_WDXHT_MSA(8, 8) +AOM_VARIANCE_WDXHT_MSA(8, 16) + +AOM_VARIANCE_WDXHT_MSA(16, 8) +AOM_VARIANCE_WDXHT_MSA(16, 16) +AOM_VARIANCE_WDXHT_MSA(16, 32) + +AOM_VARIANCE_WDXHT_MSA(32, 16) +AOM_VARIANCE_WDXHT_MSA(32, 32) +/* clang-format on */ + +uint32_t aom_variance32x64_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_32Wx64H(*sse, diff); +} + +uint32_t aom_variance64x32_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t 
ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_64Wx32H(*sse, diff); +} + +uint32_t aom_variance64x64_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_64Wx64H(*sse, diff); +} + +uint32_t aom_mse8x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse) { + *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8); + + return *sse; +} + +uint32_t aom_mse8x16_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16); + + return *sse; +} + +uint32_t aom_mse16x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8); + + return *sse; +} + +uint32_t aom_mse16x16_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16); + + return *sse; +} + +void aom_get8x8var_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { + *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum); +} + +void aom_get16x16var_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { + *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum); +} + +uint32_t aom_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); } diff --git a/third_party/aom/aom_dsp/postproc.h b/third_party/aom/aom_dsp/postproc.h new file mode 100644 index 000000000..11a8c5ad7 --- /dev/null +++ b/third_party/aom/aom_dsp/postproc.h @@ -0,0 +1,26 @@ 
+/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_POSTPROC_H_ +#define AOM_DSP_POSTPROC_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +// Fills a noise buffer with gaussian noise strength determined by sigma. +int aom_setup_noise(double sigma, int size, char *noise); + +#ifdef __cplusplus +} +#endif + +#endif // AOM_DSP_POSTPROC_H_ diff --git a/third_party/aom/aom_dsp/prob.c b/third_party/aom/aom_dsp/prob.c new file mode 100644 index 000000000..c60bfdac5 --- /dev/null +++ b/third_party/aom/aom_dsp/prob.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "./aom_config.h" + +#if CONFIG_EC_MULTISYMBOL +#include +#endif + +#include "aom_dsp/prob.h" + +const uint8_t aom_norm[256] = { + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +static unsigned int tree_merge_probs_impl(unsigned int i, + const aom_tree_index *tree, + const aom_prob *pre_probs, + const unsigned int *counts, + aom_prob *probs) { + const int l = tree[i]; + const unsigned int left_count = + (l <= 0) ? counts[-l] + : tree_merge_probs_impl(l, tree, pre_probs, counts, probs); + const int r = tree[i + 1]; + const unsigned int right_count = + (r <= 0) ? 
counts[-r] + : tree_merge_probs_impl(r, tree, pre_probs, counts, probs); + const unsigned int ct[2] = { left_count, right_count }; + probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct); + return left_count + right_count; +} + +void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs, + const unsigned int *counts, aom_prob *probs) { + tree_merge_probs_impl(0, tree, pre_probs, counts, probs); +} + +#if CONFIG_EC_MULTISYMBOL +typedef struct tree_node tree_node; + +struct tree_node { + aom_tree_index index; + uint8_t probs[16]; + uint8_t prob; + int path; + int len; + int l; + int r; + aom_cdf_prob pdf; +}; + +/* Compute the probability of this node in Q23 */ +static uint32_t tree_node_prob(tree_node n, int i) { + uint32_t prob; + /* 1.0 in Q23 */ + prob = 16777216; + for (; i < n.len; i++) { + prob = prob * n.probs[i] >> 8; + } + return prob; +} + +static int tree_node_cmp(tree_node a, tree_node b) { + int i; + uint32_t pa; + uint32_t pb; + for (i = 0; i < AOMMIN(a.len, b.len) && a.probs[i] == b.probs[i]; i++) { + } + pa = tree_node_prob(a, i); + pb = tree_node_prob(b, i); + return pa > pb ? 1 : pa < pb ? -1 : 0; +} + +/* Given a Q15 probability for symbol subtree rooted at tree[n], this function + computes the probability of each symbol (defined as a node that has no + children). */ +static aom_cdf_prob tree_node_compute_probs(tree_node *tree, int n, + aom_cdf_prob pdf) { + if (tree[n].l == 0) { + /* This prevents probability computations in Q15 that underflow from + producing a symbol that has zero probability. */ + if (pdf == 0) pdf = 1; + tree[n].pdf = pdf; + return pdf; + } else { + /* We process the smaller probability first, */ + if (tree[n].prob < 128) { + aom_cdf_prob lp; + aom_cdf_prob rp; + lp = (((uint32_t)pdf) * tree[n].prob + 128) >> 8; + lp = tree_node_compute_probs(tree, tree[n].l, lp); + rp = tree_node_compute_probs(tree, tree[n].r, lp > pdf ? 
0 : pdf - lp); + return lp + rp; + } else { + aom_cdf_prob rp; + aom_cdf_prob lp; + rp = (((uint32_t)pdf) * (256 - tree[n].prob) + 128) >> 8; + rp = tree_node_compute_probs(tree, tree[n].r, rp); + lp = tree_node_compute_probs(tree, tree[n].l, rp > pdf ? 0 : pdf - rp); + return lp + rp; + } + } +} + +static int tree_node_extract(tree_node *tree, int n, int symb, + aom_cdf_prob *pdf, aom_tree_index *index, + int *path, int *len) { + if (tree[n].l == 0) { + pdf[symb] = tree[n].pdf; + if (index != NULL) index[symb] = tree[n].index; + if (path != NULL) path[symb] = tree[n].path; + if (len != NULL) len[symb] = tree[n].len; + return symb + 1; + } else { + symb = tree_node_extract(tree, tree[n].l, symb, pdf, index, path, len); + return tree_node_extract(tree, tree[n].r, symb, pdf, index, path, len); + } +} + +int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs, + aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *index, + int *path, int *len) { + tree_node symb[2 * 16 - 1]; + int nodes; + int next[16]; + int size; + int nsymbs; + int i; + /* Create the root node with probability 1 in Q15. */ + symb[0].index = root; + symb[0].path = 0; + symb[0].len = 0; + symb[0].l = symb[0].r = 0; + nodes = 1; + next[0] = 0; + size = 1; + nsymbs = 1; + while (size > 0 && nsymbs < 16) { + int m; + tree_node n; + aom_tree_index j; + uint8_t prob; + m = 0; + /* Find the internal node with the largest probability. 
*/ + for (i = 1; i < size; i++) { + if (tree_node_cmp(symb[next[i]], symb[next[m]]) > 0) m = i; + } + i = next[m]; + memmove(&next[m], &next[m + 1], sizeof(*next) * (size - (m + 1))); + size--; + /* Split this symbol into two symbols */ + n = symb[i]; + j = n.index; + prob = probs[j >> 1]; + /* Left */ + n.index = tree[j]; + n.path <<= 1; + n.len++; + n.probs[n.len - 1] = prob; + symb[nodes] = n; + if (n.index > 0) { + next[size++] = nodes; + } + /* Right */ + n.index = tree[j + 1]; + n.path += 1; + n.probs[n.len - 1] = 256 - prob; + symb[nodes + 1] = n; + if (n.index > 0) { + next[size++] = nodes + 1; + } + symb[i].prob = prob; + symb[i].l = nodes; + symb[i].r = nodes + 1; + nodes += 2; + nsymbs++; + } + /* Compute the probabilities of each symbol in Q15 */ + tree_node_compute_probs(symb, 0, CDF_PROB_TOP); + /* Extract the cdf, index, path and length */ + tree_node_extract(symb, 0, 0, cdf, index, path, len); + /* Convert to CDF */ + cdf[0] = AOM_ICDF(cdf[0]); + for (i = 1; i < nsymbs; i++) { + cdf[i] = AOM_ICDF(AOM_ICDF(cdf[i - 1]) + cdf[i]); + } +// Store symbol count at the end of the CDF +#if CONFIG_EC_ADAPT + cdf[nsymbs] = 0; +#endif + return nsymbs; +} + +/* This code assumes that tree contains as unique leaf nodes the integer values + 0 to len - 1 and produces the forward and inverse mapping tables in ind[] + and inv[] respectively. 
*/ +static void tree_to_index(int *stack_index, int *ind, int *inv, + const aom_tree_index *tree, int value, int index) { + value *= 2; + + do { + const aom_tree_index content = tree[index]; + ++index; + if (content <= 0) { + inv[*stack_index] = -content; + ind[-content] = *stack_index; + ++(*stack_index); + } else { + tree_to_index(stack_index, ind, inv, tree, value, content); + } + } while (++value & 1); +} + +void av1_indices_from_tree(int *ind, int *inv, const aom_tree_index *tree) { + int stack_index = 0; + tree_to_index(&stack_index, ind, inv, tree, 0, 0); +} +#endif diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h new file mode 100644 index 000000000..808592923 --- /dev/null +++ b/third_party/aom/aom_dsp/prob.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_PROB_H_ +#define AOM_DSP_PROB_H_ + +#include + +#include "./aom_config.h" +#include "./aom_dsp_common.h" + +#include "aom_ports/bitops.h" +#include "aom_ports/mem.h" + +#if CONFIG_DAALA_EC +#include "aom_dsp/entcode.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint8_t aom_prob; + +// TODO(negge): Rename this aom_prob once we remove vpxbool. 
+typedef uint16_t aom_cdf_prob; + +#if CONFIG_EC_MULTISYMBOL +#define CDF_SIZE(x) ((x) + 1) +#endif + +#define CDF_PROB_BITS 15 +#define CDF_PROB_TOP (1 << CDF_PROB_BITS) + +#if CONFIG_DAALA_EC +#define AOM_ICDF OD_ICDF +#else +#define AOM_ICDF(x) (x) +#endif + +#define MAX_PROB 255 + +#define aom_prob_half ((aom_prob)128) + +typedef int8_t aom_tree_index; + +#define TREE_SIZE(leaf_count) (-2 + 2 * (leaf_count)) + +#define MODE_MV_COUNT_SAT 20 + +/* We build coding trees compactly in arrays. + Each node of the tree is a pair of aom_tree_indices. + Array index often references a corresponding probability table. + Index <= 0 means done encoding/decoding and value = -Index, + Index > 0 means need another bit, specification at index. + Nonnegative indices are always even; processing begins at node 0. */ + +typedef const aom_tree_index aom_tree[]; + +static INLINE aom_prob get_prob(unsigned int num, unsigned int den) { + assert(den != 0); + { + const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den); + // (p > 255) ? 255 : (p < 1) ? 1 : p; + const int clipped_prob = p | ((255 - p) >> 23) | (p == 0); + return (aom_prob)clipped_prob; + } +} + +static INLINE aom_prob get_binary_prob(unsigned int n0, unsigned int n1) { + const unsigned int den = n0 + n1; + if (den == 0) return 128u; + return get_prob(n0, den); +} + +/* This function assumes prob1 and prob2 are already within [1,255] range. 
*/ +static INLINE aom_prob weighted_prob(int prob1, int prob2, int factor) { + return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8); +} + +static INLINE aom_prob merge_probs(aom_prob pre_prob, const unsigned int ct[2], + unsigned int count_sat, + unsigned int max_update_factor) { + const aom_prob prob = get_binary_prob(ct[0], ct[1]); + const unsigned int count = AOMMIN(ct[0] + ct[1], count_sat); + const unsigned int factor = max_update_factor * count / count_sat; + return weighted_prob(pre_prob, prob, factor); +} + +// MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT; +static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = { + 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64, + 70, 76, 83, 89, 96, 102, 108, 115, 121, 128 +}; + +static INLINE aom_prob mode_mv_merge_probs(aom_prob pre_prob, + const unsigned int ct[2]) { + const unsigned int den = ct[0] + ct[1]; + if (den == 0) { + return pre_prob; + } else { + const unsigned int count = AOMMIN(den, MODE_MV_COUNT_SAT); + const unsigned int factor = count_to_update_factor[count]; + const aom_prob prob = get_prob(ct[0], den); + return weighted_prob(pre_prob, prob, factor); + } +} + +void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs, + const unsigned int *counts, aom_prob *probs); + +#if CONFIG_EC_MULTISYMBOL +int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs, + aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *ind, + int *pth, int *len); + +static INLINE void av1_tree_to_cdf(const aom_tree_index *tree, + const aom_prob *probs, aom_cdf_prob *cdf) { + aom_tree_index index[16]; + int path[16]; + int dist[16]; + tree_to_cdf(tree, probs, 0, cdf, index, path, dist); +} + +#define av1_tree_to_cdf_1D(tree, probs, cdf, u) \ + do { \ + int i; \ + for (i = 0; i < u; i++) { \ + av1_tree_to_cdf(tree, probs[i], cdf[i]); \ + } \ + } while (0) + +#define av1_tree_to_cdf_2D(tree, probs, cdf, v, u) \ + do { \ + int j; \ + int i; \ + for (j = 0; j < v; j++) 
{ \ + for (i = 0; i < u; i++) { \ + av1_tree_to_cdf(tree, probs[j][i], cdf[j][i]); \ + } \ + } \ + } while (0) + +void av1_indices_from_tree(int *ind, int *inv, const aom_tree_index *tree); +#endif + +DECLARE_ALIGNED(16, extern const uint8_t, aom_norm[256]); + +#if CONFIG_EC_ADAPT +static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) { + const int rate = 4 + (cdf[nsymbs] > 31) + get_msb(nsymbs); + const int rate2 = 5; + int i, tmp; + int diff; +#if 1 + const int tmp0 = 1 << rate2; + tmp = AOM_ICDF(tmp0); + diff = ((CDF_PROB_TOP - (nsymbs << rate2)) >> rate) << rate; +// Single loop (faster) +#if CONFIG_DAALA_EC && CONFIG_EC_SMALLMUL + for (i = 0; i < nsymbs - 1; ++i, tmp -= tmp0) { + tmp -= (i == val ? diff : 0); + cdf[i] += ((tmp - cdf[i]) >> rate); + } +#else + for (i = 0; i < nsymbs - 1; ++i, tmp += tmp0) { + tmp += (i == val ? diff : 0); + cdf[i] -= ((cdf[i] - tmp) >> rate); + } +#endif +#else + for (i = 0; i < nsymbs; ++i) { + tmp = (i + 1) << rate2; + cdf[i] -= ((cdf[i] - tmp) >> rate); + } + diff = CDF_PROB_TOP - cdf[nsymbs - 1]; + + for (i = val; i < nsymbs; ++i) { + cdf[i] += diff; + } +#endif + cdf[nsymbs] += (cdf[nsymbs] < 32); +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_PROB_H_ diff --git a/third_party/aom/aom_dsp/psnr.c b/third_party/aom/aom_dsp/psnr.c new file mode 100644 index 000000000..461c13729 --- /dev/null +++ b/third_party/aom/aom_dsp/psnr.c @@ -0,0 +1,373 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/psnr.h" +#include "aom_scale/yv12config.h" + +double aom_sse_to_psnr(double samples, double peak, double sse) { + if (sse > 0.0) { + const double psnr = 10.0 * log10(samples * peak * peak / sse); + return psnr > MAX_PSNR ? MAX_PSNR : psnr; + } else { + return MAX_PSNR; + } +} + +/* TODO(yaowu): The block_variance calls the unoptimized versions of variance() +* and highbd_8_variance(). It should not. +*/ +static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, unsigned int *sse, + int *sum) { + int i, j; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } +} + +#if CONFIG_HIGHBITDEPTH +static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h, uint64_t *sse, int64_t *sum) { + int i, j; + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + a += a_stride; + b += b_stride; + } +} + +static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h, unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, + &sum_long); + *sse = (unsigned int)sse_long; + *sum = (int)sum_long; +} +#endif // CONFIG_HIGHBITDEPTH + +static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + const int dw = width % 16; + const int dh = height % 16; + int64_t total_sse = 0; + unsigned int sse = 0; + int sum = 0; + int x, y; + + if (dw > 0) { + encoder_variance(&a[width - 
dw], a_stride, &b[width - dw], b_stride, dw, + height, &sse, &sum); + total_sse += sse; + } + + if (dh > 0) { + encoder_variance(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, width - dw, dh, + &sse, &sum); + total_sse += sse; + } + + for (y = 0; y < height / 16; ++y) { + const uint8_t *pa = a; + const uint8_t *pb = b; + for (x = 0; x < width / 16; ++x) { + aom_mse16x16(pa, a_stride, pb, b_stride, &sse); + total_sse += sse; + + pa += 16; + pb += 16; + } + + a += 16 * a_stride; + b += 16 * b_stride; + } + + return total_sse; +} + +#if CONFIG_HIGHBITDEPTH +static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height, unsigned int input_shift) { + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + int64_t total_sse = 0; + int x, y; + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + int64_t diff; + diff = (a[x] >> input_shift) - (b[x] >> input_shift); + total_sse += diff * diff; + } + a += a_stride; + b += b_stride; + } + return total_sse; +} + +static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int64_t total_sse = 0; + int x, y; + const int dw = width % 16; + const int dh = height % 16; + unsigned int sse = 0; + int sum = 0; + if (dw > 0) { + encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], + b_stride, dw, height, &sse, &sum); + total_sse += sse; + } + if (dh > 0) { + encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh, &sse, &sum); + total_sse += sse; + } + for (y = 0; y < height / 16; ++y) { + const uint8_t *pa = a; + const uint8_t *pb = b; + for (x = 0; x < width / 16; ++x) { + aom_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse); + total_sse += sse; + pa += 16; + pb += 16; + } + a += 16 * a_stride; + b += 16 * b_stride; + } + return 
total_sse; +} +#endif // CONFIG_HIGHBITDEPTH + +int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height) { + return get_sse(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride, + b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, + width, height); +} + +int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->y_crop_width == b->y_crop_width); + assert(a->y_crop_height == b->y_crop_height); + + return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, + a->y_crop_width, a->y_crop_height); +} + +int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height) { + return get_sse(a->u_buffer + vstart * a->uv_stride + hstart, a->uv_stride, + b->u_buffer + vstart * b->uv_stride + hstart, b->uv_stride, + width, height); +} + +int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + + return get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} + +int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height) { + return get_sse(a->v_buffer + vstart * a->uv_stride + hstart, a->uv_stride, + b->v_buffer + vstart * b->uv_stride + hstart, b->uv_stride, + width, height); +} + +int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + + return get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} + +#if CONFIG_HIGHBITDEPTH +int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, 
int height) { + return highbd_get_sse( + a->y_buffer + vstart * a->y_stride + hstart, a->y_stride, + b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, width, height); +} + +int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->y_crop_width == b->y_crop_width); + assert(a->y_crop_height == b->y_crop_height); + assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + + return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, + a->y_crop_width, a->y_crop_height); +} + +int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height) { + return highbd_get_sse(a->u_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, + b->u_buffer + vstart * b->uv_stride + hstart, + b->uv_stride, width, height); +} + +int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + + return highbd_get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} + +int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height) { + return highbd_get_sse(a->v_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, + b->v_buffer + vstart * b->uv_stride + hstart, + b->uv_stride, width, height); +} + +int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + + return highbd_get_sse(a->v_buffer, a->uv_stride, 
b->v_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} +#endif // CONFIG_HIGHBITDEPTH + +#if CONFIG_HIGHBITDEPTH +void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, + uint32_t bit_depth, uint32_t in_bit_depth) { + const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; + const int heights[3] = { a->y_crop_height, a->uv_crop_height, + a->uv_crop_height }; + const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer }; + const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; + const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer }; + const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; + int i; + uint64_t total_sse = 0; + uint32_t total_samples = 0; + const double peak = (double)((1 << in_bit_depth) - 1); + const unsigned int input_shift = bit_depth - in_bit_depth; + + for (i = 0; i < 3; ++i) { + const int w = widths[i]; + const int h = heights[i]; + const uint32_t samples = w * h; + uint64_t sse; + if (a->flags & YV12_FLAG_HIGHBITDEPTH) { + if (input_shift) { + sse = highbd_get_sse_shift(a_planes[i], a_strides[i], b_planes[i], + b_strides[i], w, h, input_shift); + } else { + sse = highbd_get_sse(a_planes[i], a_strides[i], b_planes[i], + b_strides[i], w, h); + } + } else { + sse = get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h); + } + psnr->sse[1 + i] = sse; + psnr->samples[1 + i] = samples; + psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse); + + total_sse += sse; + total_samples += samples; + } + + psnr->sse[0] = total_sse; + psnr->samples[0] = total_samples; + psnr->psnr[0] = + aom_sse_to_psnr((double)total_samples, peak, (double)total_sse); +} + +#endif // CONFIG_HIGHBITDEPTH + +void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, + PSNR_STATS *psnr) { + static const double peak = 255.0; + const int widths[3] = { a->y_crop_width, a->uv_crop_width, 
a->uv_crop_width }; + const int heights[3] = { a->y_crop_height, a->uv_crop_height, + a->uv_crop_height }; + const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer }; + const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; + const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer }; + const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; + int i; + uint64_t total_sse = 0; + uint32_t total_samples = 0; + + for (i = 0; i < 3; ++i) { + const int w = widths[i]; + const int h = heights[i]; + const uint32_t samples = w * h; + const uint64_t sse = + get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h); + psnr->sse[1 + i] = sse; + psnr->samples[1 + i] = samples; + psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse); + + total_sse += sse; + total_samples += samples; + } + + psnr->sse[0] = total_sse; + psnr->samples[0] = total_samples; + psnr->psnr[0] = + aom_sse_to_psnr((double)total_samples, peak, (double)total_sse); +} diff --git a/third_party/aom/aom_dsp/psnr.h b/third_party/aom/aom_dsp/psnr.h new file mode 100644 index 000000000..480140e6f --- /dev/null +++ b/third_party/aom/aom_dsp/psnr.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_PSNR_H_ +#define AOM_DSP_PSNR_H_ + +#include "aom_scale/yv12config.h" + +#define MAX_PSNR 100.0 + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + double psnr[4]; // total/y/u/v + uint64_t sse[4]; // total/y/u/v + uint32_t samples[4]; // total/y/u/v +} PSNR_STATS; + +/*!\brief Converts SSE to PSNR +* +* Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR). +* +* \param[in] samples Number of samples +* \param[in] peak Max sample value +* \param[in] sse Sum of squared errors +*/ +double aom_sse_to_psnr(double samples, double peak, double sse); +int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height); +int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height); +int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height); +int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +#if CONFIG_HIGHBITDEPTH +int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height); +int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height); +int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height); +int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG 
*b); +void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, + unsigned int bit_depth, unsigned int in_bit_depth); +#endif +void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, + PSNR_STATS *psnr); + +double aom_psnrhvs(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *phvs_y, + double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd); +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_DSP_PSNR_H_ diff --git a/third_party/aom/aom_dsp/psnrhvs.c b/third_party/aom/aom_dsp/psnrhvs.c new file mode 100644 index 000000000..aeefd5908 --- /dev/null +++ b/third_party/aom/aom_dsp/psnrhvs.c @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + * + * This code was originally written by: Gregory Maxwell, at the Daala + * project. 
+ */ + +#include +#include +#include +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/psnr.h" +#include "aom_dsp/ssim.h" +#include "aom_ports/system_state.h" + +#if !defined(M_PI) +#define M_PI (3.141592653589793238462643) +#endif +#include + +static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, + int xstride) { + int i, j; + (void)xstride; + aom_fdct8x8(x, y, ystride); + for (i = 0; i < 8; i++) + for (j = 0; j < 8; j++) + *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; +} + +#if CONFIG_HIGHBITDEPTH +static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, + int xstride) { + int i, j; + (void)xstride; + aom_highbd_fdct8x8(x, y, ystride); + for (i = 0; i < 8; i++) + for (j = 0; j < 8; j++) + *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; +} +#endif + +/* Normalized inverse quantization matrix for 8x8 DCT at the point of + * transparency. This is not the JPEG based matrix from the paper, + this one gives a slightly higher MOS agreement.*/ +static const double csf_y[8][8] = { + { 1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334, + 0.678296995242, 0.466224900598, 0.3265091542 }, + { 2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963, + 0.868920337363, 0.61280991668, 0.436405793551 }, + { 2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257, + 0.670882927016, 0.501731932449, 0.372504254596 }, + { 1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, 0.605636379554, + 0.48309405692, 0.380429446972, 0.295774038565 }, + { 1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, 0.448996256676, + 0.352889268808, 0.283006984131, 0.226951348204 }, + { 0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692, + 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321 }, + { 0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972, + 0.283006984131, 0.215017739696, 0.168869545842, 
0.136153931001 }, + { 0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565, + 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276 } +}; +static const double csf_cb420[8][8] = { + { 1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788, + 0.898018824055, 0.74725392039, 0.615105596242 }, + { 2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972, + 1.17428548929, 0.996404342439, 0.830890433625 }, + { 1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362, + 0.960060382087, 0.849823426169, 0.731221236837 }, + { 1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099, + 0.751437590932, 0.685398513368, 0.608694761374 }, + { 1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187, + 0.605503172737, 0.55002013668, 0.495804539034 }, + { 0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932, + 0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965 }, + { 0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368, + 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733 }, + { 0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374, + 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237 } +}; +static const double csf_cr420[8][8] = { + { 2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469, + 0.867069376285, 0.721500455585, 0.593906509971 }, + { 2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198, + 1.13381474809, 0.962064122248, 0.802254508198 }, + { 1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848, + 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706 }, + { 1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195, + 0.725539939514, 0.661776842059, 0.587716619023 }, + { 1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286, + 0.584635025748, 0.531064164893, 0.478717061273 }, + { 0.867069376285, 1.13381474809, 
0.926972725286, 0.725539939514, + 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543 }, + { 0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059, + 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063 }, + { 0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023, + 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658 } +}; + +static double convert_score_db(double _score, double _weight, int bit_depth) { + int16_t pix_max = 255; + assert(_score * _weight >= 0.0); + if (bit_depth == 10) + pix_max = 1023; + else if (bit_depth == 12) + pix_max = 4095; + + if (_weight * _score < pix_max * pix_max * 1e-10) return MAX_PSNR; + return 10 * (log10(pix_max * pix_max) - log10(_weight * _score)); +} + +static double calc_psnrhvs(const unsigned char *src, int _systride, + const unsigned char *dst, int _dystride, double _par, + int _w, int _h, int _step, const double _csf[8][8], + uint32_t bit_depth, uint32_t _shift) { + double ret; + const uint8_t *_src8 = src; + const uint8_t *_dst8 = dst; + const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src); + const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst); + int16_t dct_s[8 * 8], dct_d[8 * 8]; + tran_low_t dct_s_coef[8 * 8], dct_d_coef[8 * 8]; + double mask[8][8]; + int pixels; + int x; + int y; + (void)_par; + ret = pixels = 0; + /*In the PSNR-HVS-M paper[1] the authors describe the construction of + their masking table as "we have used the quantization table for the + color component Y of JPEG [6] that has been also obtained on the + basis of CSF. Note that the values in quantization table JPEG have + been normalized and then squared." Their CSF matrix (from PSNR-HVS) + was also constructed from the JPEG matrices. I can not find any obvious + scheme of normalizing to produce their table, but if I multiply their + CSF by 0.38857 and square the result I get their masking table. + I have no idea where this constant comes from, but deviating from it + too greatly hurts MOS agreement. 
+ + [1] Nikolay Ponomarenko, Flavia Silvestri, Karen Egiazarian, Marco Carli, + Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking + of DCT basis functions", CD-ROM Proceedings of the Third + International Workshop on Video Processing and Quality Metrics for Consumer + Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.*/ + for (x = 0; x < 8; x++) + for (y = 0; y < 8; y++) + mask[x][y] = + (_csf[x][y] * 0.3885746225901003) * (_csf[x][y] * 0.3885746225901003); + for (y = 0; y < _h - 7; y += _step) { + for (x = 0; x < _w - 7; x += _step) { + int i; + int j; + double s_means[4]; + double d_means[4]; + double s_vars[4]; + double d_vars[4]; + double s_gmean = 0; + double d_gmean = 0; + double s_gvar = 0; + double d_gvar = 0; + double s_mask = 0; + double d_mask = 0; + for (i = 0; i < 4; i++) + s_means[i] = d_means[i] = s_vars[i] = d_vars[i] = 0; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + int sub = ((i & 12) >> 2) + ((j & 12) >> 1); + if (bit_depth == 8 && _shift == 0) { + dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)]; + dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)]; + } else if (bit_depth == 10 || bit_depth == 12) { + dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift; + dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift; + } + s_gmean += dct_s[i * 8 + j]; + d_gmean += dct_d[i * 8 + j]; + s_means[sub] += dct_s[i * 8 + j]; + d_means[sub] += dct_d[i * 8 + j]; + } + } + s_gmean /= 64.f; + d_gmean /= 64.f; + for (i = 0; i < 4; i++) s_means[i] /= 16.f; + for (i = 0; i < 4; i++) d_means[i] /= 16.f; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + int sub = ((i & 12) >> 2) + ((j & 12) >> 1); + s_gvar += (dct_s[i * 8 + j] - s_gmean) * (dct_s[i * 8 + j] - s_gmean); + d_gvar += (dct_d[i * 8 + j] - d_gmean) * (dct_d[i * 8 + j] - d_gmean); + s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub]) * + (dct_s[i * 8 + j] - s_means[sub]); + d_vars[sub] += (dct_d[i * 8 + j] - 
d_means[sub]) * + (dct_d[i * 8 + j] - d_means[sub]); + } + } + s_gvar *= 1 / 63.f * 64; + d_gvar *= 1 / 63.f * 64; + for (i = 0; i < 4; i++) s_vars[i] *= 1 / 15.f * 16; + for (i = 0; i < 4; i++) d_vars[i] *= 1 / 15.f * 16; + if (s_gvar > 0) + s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar; + if (d_gvar > 0) + d_gvar = (d_vars[0] + d_vars[1] + d_vars[2] + d_vars[3]) / d_gvar; +#if CONFIG_HIGHBITDEPTH + if (bit_depth == 10 || bit_depth == 12) { + hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); + hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); + } +#endif + if (bit_depth == 8) { + od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); + od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); + } + for (i = 0; i < 8; i++) + for (j = (i == 0); j < 8; j++) + s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j]; + for (i = 0; i < 8; i++) + for (j = (i == 0); j < 8; j++) + d_mask += dct_d_coef[i * 8 + j] * dct_d_coef[i * 8 + j] * mask[i][j]; + s_mask = sqrt(s_mask * s_gvar) / 32.f; + d_mask = sqrt(d_mask * d_gvar) / 32.f; + if (d_mask > s_mask) s_mask = d_mask; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + double err; + err = fabs((double)(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j])); + if (i != 0 || j != 0) + err = err < s_mask / mask[i][j] ? 
0 : err - s_mask / mask[i][j]; + ret += (err * _csf[i][j]) * (err * _csf[i][j]); + pixels++; + } + } + } + } + if (pixels <= 0) return 0; + ret /= pixels; + return ret; +} + +double aom_psnrhvs(const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dst, + double *y_psnrhvs, double *u_psnrhvs, double *v_psnrhvs, + uint32_t bd, uint32_t in_bd) { + double psnrhvs; + const double par = 1.0; + const int step = 7; + uint32_t bd_shift = 0; + aom_clear_system_state(); + + assert(bd == 8 || bd == 10 || bd == 12); + assert(bd >= in_bd); + + bd_shift = bd - in_bd; + + *y_psnrhvs = calc_psnrhvs(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, par, src->y_crop_width, + src->y_crop_height, step, csf_y, bd, bd_shift); + *u_psnrhvs = calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, par, src->uv_crop_width, + src->uv_crop_height, step, csf_cb420, bd, bd_shift); + *v_psnrhvs = calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, par, src->uv_crop_width, + src->uv_crop_height, step, csf_cr420, bd, bd_shift); + psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs)); + return convert_score_db(psnrhvs, 1.0, in_bd); +} diff --git a/third_party/aom/aom_dsp/quantize.c b/third_party/aom/aom_dsp/quantize.c new file mode 100644 index 000000000..0759c22e3 --- /dev/null +++ b/third_party/aom/aom_dsp/quantize.c @@ -0,0 +1,832 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/quantize.h" +#include "aom_mem/aom_mem.h" + +static void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, +#if CONFIG_AOM_QM + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, +#endif + const int log_scale) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + int i, non_zero_count = (int)n_coeffs, eob = -1; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; +#if CONFIG_AOM_QM + const qm_val_t wt = qm_ptr[rc]; + const int coeff = coeff_ptr[rc] * wt; +#else + const int coeff = coeff_ptr[rc]; +#endif // CONFIG_AOM_QM + +#if CONFIG_AOM_QM + if (coeff < (zbins[rc != 0] << AOM_QM_BITS) && + coeff > (nzbins[rc != 0] << AOM_QM_BITS)) + non_zero_count--; +#else + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) non_zero_count--; +#endif // CONFIG_AOM_QM + else + break; + } + + // Quantization pass: All coefficients with index >= non_zero_count are + // skippable. Note: non_zero_count can be zero. 
+ for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32; + +#if CONFIG_AOM_QM + const qm_val_t wt = qm_ptr[rc]; + if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { +#else + if (abs_coeff >= zbins[rc != 0]) { +#endif // CONFIG_AOM_QM + int64_t tmp = + clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), + INT16_MIN, INT16_MAX); +#if CONFIG_AOM_QM + tmp *= wt; + tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); // quantization +#else + tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + (16 - log_scale)); // quantization +#endif // CONFIG_AOM_QM + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; +#if CONFIG_AOM_QM + const int dequant = + (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / (1 << log_scale); +#else + dqcoeff_ptr[rc] = + qcoeff_ptr[rc] * dequant_ptr[rc != 0] / (1 << log_scale); +#endif // CONFIG_AOM_QM + + if (tmp32) eob = i; + } + } + } + *eob_ptr = eob + 1; +} + +void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan +#if CONFIG_AOM_QM + , + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr +#endif + ) { + quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, +#if CONFIG_AOM_QM + qm_ptr, iqm_ptr, +#endif + 0); +} + +void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t 
n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan +#if CONFIG_AOM_QM + , + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr +#endif + ) { + quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, +#if CONFIG_AOM_QM + qm_ptr, iqm_ptr, +#endif + 1); +} + +#if CONFIG_TX64X64 +void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan +#if CONFIG_AOM_QM + , + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr +#endif + ) { + quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, +#if CONFIG_AOM_QM + qm_ptr, iqm_ptr, +#endif + 2); +} +#endif // CONFIG_TX64X64 + +#if CONFIG_AOM_QM +void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) { + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int64_t tmp, eob = -1; + int32_t tmp32; + int dequant = + (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * 
sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (16 + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant; + if (tmp32) eob = 0; + } + *eob_ptr = eob + 1; +} + +void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) { + const int n_coeffs = 1024; + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int64_t tmp, eob = -1; + int32_t tmp32; + int dequant; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), + INT16_MIN, INT16_MAX); + tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (15 + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + dequant = + (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2; + if (tmp32) eob = 0; + } + *eob_ptr = eob + 1; +} + +#if CONFIG_TX64X64 +void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) { + const int n_coeffs = 1024; + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int64_t tmp, eob = -1; + int32_t tmp32; + int dequant; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + 
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2), + INT16_MIN, INT16_MAX); + tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (14 + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + dequant = + (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 4; + if (tmp32) eob = 0; + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_TX64X64 + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + uint16_t *eob_ptr, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr) { + int eob = -1; + int dequant = + (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + const int coeff = coeff_ptr[0]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + round_ptr[0]; + const uint32_t abs_qcoeff = + (uint32_t)((tmp * qm_ptr[0] * quant) >> (16 + AOM_QM_BITS)); + qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant; + if (abs_qcoeff) eob = 0; + } + *eob_ptr = eob + 1; +} + +void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr) { + const int n_coeffs = 1024; + int eob = -1; + int dequant; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if 
(!skip_block) { + const int coeff = coeff_ptr[0]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1); + const uint32_t abs_qcoeff = + (uint32_t)((tmp * qm_ptr[0] * quant) >> (15 + AOM_QM_BITS)); + qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dequant = + (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / 2; + if (abs_qcoeff) eob = 0; + } + *eob_ptr = eob + 1; +} + +#if CONFIG_TX64X64 +void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr) { + const int n_coeffs = 1024; + int eob = -1; + int dequant; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + const int coeff = coeff_ptr[0]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 2); + const uint32_t abs_qcoeff = + (uint32_t)((tmp * qm_ptr[0] * quant) >> (14 + AOM_QM_BITS)); + qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dequant = + (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / 4; + if (abs_qcoeff) eob = 0; + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_TX64X64 + +void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t 
*scan, const int16_t *iscan, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) { + int i, non_zero_count = (int)n_coeffs, eob = -1; + const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + int dequant; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr[rc]; + const int coeff = coeff_ptr[rc] * wt; + + if (coeff < (zbins[rc != 0] << AOM_QM_BITS) && + coeff > (nzbins[rc != 0] << AOM_QM_BITS)) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= non_zero_count are + // skippable. Note: non_zero_count can be zero. + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const qm_val_t wt = qm_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { + const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; + const int64_t tmpw = tmp1 * wt; + const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (16 + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dequant = + (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant; + if (abs_qcoeff) eob = i; + } + } + } + *eob_ptr = eob + 1; +} + +void aom_highbd_quantize_b_32x32_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t 
*scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), + ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + int dequant; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr[rc]; + const int coeff = coeff_ptr[rc] * wt; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= (zbins[rc != 0] << AOM_QM_BITS) || + coeff <= (nzbins[rc != 0] << AOM_QM_BITS)) + idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const qm_val_t wt = qm_ptr[rc]; + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmpw = tmp1 * wt; + const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (15 + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dequant = + (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2; + if (abs_qcoeff) eob = idx_arr[i]; + } + } + *eob_ptr = eob + 1; +} + +#if CONFIG_TX64X64 +void aom_highbd_quantize_b_64x64_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t 
*qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2), + ROUND_POWER_OF_TWO(zbin_ptr[1], 2) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + + int idx = 0; + int idx_arr[4096]; + int i, eob = -1; + int dequant; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr[rc]; + const int coeff = coeff_ptr[rc] * wt; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= (zbins[rc != 0] << AOM_QM_BITS) || + coeff <= (nzbins[rc != 0] << AOM_QM_BITS)) + idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. 
+ for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const qm_val_t wt = qm_ptr[rc]; + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2); + const int64_t tmpw = tmp1 * wt; + const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (14 + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dequant = + (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 4; + if (abs_qcoeff) eob = idx_arr[i]; + } + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_TX64X64 +#endif // CONFIG_HIGHBITDEPTH + +#else // CONFIG_AOM_QM + +void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp, eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant) >> 16; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr; + if (tmp) eob = 0; + } + *eob_ptr = eob + 1; +} + +void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int n_coeffs = 1024; + const int rc = 0; + const int coeff = coeff_ptr[rc]; + 
const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp, eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), + INT16_MIN, INT16_MAX); + tmp = (tmp * quant) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2; + if (tmp) eob = 0; + } + *eob_ptr = eob + 1; +} + +#if CONFIG_TX64X64 +void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int n_coeffs = 4096; + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp, eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2), + INT16_MIN, INT16_MAX); + tmp = (tmp * quant) >> 14; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 4; + if (tmp) eob = 0; + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_TX64X64 + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + uint16_t *eob_ptr) { + int eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + const int coeff = coeff_ptr[0]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const 
int64_t tmp = abs_coeff + round_ptr[0]; + const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16); + qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr; + if (abs_qcoeff) eob = 0; + } + *eob_ptr = eob + 1; +} + +void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, + uint16_t *eob_ptr) { + const int n_coeffs = 1024; + int eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + const int coeff = coeff_ptr[0]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1); + const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15); + qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2; + if (abs_qcoeff) eob = 0; + } + *eob_ptr = eob + 1; +} + +#if CONFIG_TX64X64 +void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, + uint16_t *eob_ptr) { + const int n_coeffs = 4096; + int eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + const int coeff = coeff_ptr[0]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 2); + const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 14); + qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 4; + if (abs_qcoeff) 
eob = 0; + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_TX64X64 + +void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i, non_zero_count = (int)n_coeffs, eob = -1; + const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. 
+ for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= zbins[rc != 0]) { + const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + if (abs_qcoeff) eob = i; + } + } + } + *eob_ptr = eob + 1; +} + +void aom_highbd_quantize_b_32x32_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), + ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) + idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. 
+ for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = idx_arr[i]; + } + } + *eob_ptr = eob + 1; +} + +#if CONFIG_TX64X64 +void aom_highbd_quantize_b_64x64_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2), + ROUND_POWER_OF_TWO(zbin_ptr[1], 2) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + + int idx = 0; + int idx_arr[4096]; + int i, eob = -1; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) + idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. 
+ for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4; + if (abs_qcoeff) eob = idx_arr[i]; + } + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_TX64X64 +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_AOM_QM diff --git a/third_party/aom/aom_dsp/quantize.h b/third_party/aom/aom_dsp/quantize.h new file mode 100644 index 000000000..fe49b830f --- /dev/null +++ b/third_party/aom/aom_dsp/quantize.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_QUANTIZE_H_ +#define AOM_DSP_QUANTIZE_H_ + +#include "./aom_config.h" +#include "aom_dsp/aom_dsp_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if CONFIG_AOM_QM +void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, + const int16_t *round_ptr, const int16_t quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr); +void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr); +#if CONFIG_TX64X64 +void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr); +#endif // CONFIG_TX64X64 +void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr); +#if CONFIG_HIGHBITDEPTH +void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + uint16_t *eob_ptr, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr); +void aom_highbd_quantize_dc_32x32( + const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, + const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t 
dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr); +#if CONFIG_TX64X64 +void aom_highbd_quantize_dc_64x64( + const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, + const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr); +#endif // CONFIG_TX64X64 +void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr); +#endif // CONFIG_HIGHBITDEPTH + +#else // CONFIG_AOM_QM + +void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, + const int16_t *round_ptr, const int16_t quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr); +void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr); +#if CONFIG_TX64X64 +void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr); +#endif // CONFIG_TX64X64 +#if CONFIG_HIGHBITDEPTH +void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + uint16_t *eob_ptr); +void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, + const 
int16_t quant_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr); +#if CONFIG_TX64X64 +void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, + const int16_t quant_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr); +#endif // CONFIG_TX64X64 +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_AOM_QM + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_QUANTIZE_H_ diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c new file mode 100644 index 000000000..3e1070519 --- /dev/null +++ b/third_party/aom/aom_dsp/sad.c @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +/* Sum the difference between every corresponding element of the buffers. 
*/ +static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y, x; + unsigned int sad = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); + + a += a_stride; + b += b_stride; + } + return sad; +} + +#define sadMxN(m, n) \ + unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return sad(src, src_stride, ref, ref_stride, m, n); \ + } \ + unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + uint8_t comp_pred[m * n]; \ + aom_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ + return sad(src, src_stride, comp_pred, m, m, n); \ + } + +// depending on call sites, pass **ref_array to avoid & in subsequent call and +// de-dup with 4D below. +#define sadMxNxK(m, n, k) \ + void aom_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref_array, int ref_stride, \ + uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < k; ++i) \ + sad_array[i] = \ + aom_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \ + } + +// This appears to be equivalent to the above when k == 4 and refs is const +#define sadMxNx4D(m, n) \ + void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) \ + sad_array[i] = \ + aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \ + } + +/* clang-format off */ +#if CONFIG_AV1 && CONFIG_EXT_PARTITION +// 128x128 +sadMxN(128, 128) +sadMxNxK(128, 128, 3) +sadMxNxK(128, 128, 8) +sadMxNx4D(128, 128) + +// 128x64 +sadMxN(128, 64) +sadMxNx4D(128, 64) + +// 64x128 +sadMxN(64, 128) +sadMxNx4D(64, 128) +#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION + +// 64x64 +sadMxN(64, 64) +sadMxNxK(64, 64, 3) +sadMxNxK(64, 
64, 8) +sadMxNx4D(64, 64) + +// 64x32 +sadMxN(64, 32) +sadMxNx4D(64, 32) + +// 32x64 +sadMxN(32, 64) +sadMxNx4D(32, 64) + +// 32x32 +sadMxN(32, 32) +sadMxNxK(32, 32, 3) +sadMxNxK(32, 32, 8) +sadMxNx4D(32, 32) + +// 32x16 +sadMxN(32, 16) +sadMxNx4D(32, 16) + +// 16x32 +sadMxN(16, 32) +sadMxNx4D(16, 32) + +// 16x16 +sadMxN(16, 16) +sadMxNxK(16, 16, 3) +sadMxNxK(16, 16, 8) +sadMxNx4D(16, 16) + +// 16x8 +sadMxN(16, 8) +sadMxNxK(16, 8, 3) +sadMxNxK(16, 8, 8) +sadMxNx4D(16, 8) + +// 8x16 +sadMxN(8, 16) +sadMxNxK(8, 16, 3) +sadMxNxK(8, 16, 8) +sadMxNx4D(8, 16) + +// 8x8 +sadMxN(8, 8) +sadMxNxK(8, 8, 3) +sadMxNxK(8, 8, 8) +sadMxNx4D(8, 8) + +// 8x4 +sadMxN(8, 4) +sadMxNxK(8, 4, 8) +sadMxNx4D(8, 4) + +// 4x8 +sadMxN(4, 8) +sadMxNxK(4, 8, 8) +sadMxNx4D(4, 8) + +// 4x4 +sadMxN(4, 4) +sadMxNxK(4, 4, 3) +sadMxNxK(4, 4, 8) +sadMxNx4D(4, 4) +/* clang-format on */ + +#if CONFIG_HIGHBITDEPTH + static INLINE + unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); + + a += a_stride; + b += b_stride; + } + return sad; +} + +static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, + const uint16_t *b, int b_stride, + int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); + + a += a_stride; + b += b_stride; + } + return sad; +} + +#define highbd_sadMxN(m, n) \ + unsigned int aom_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad(src, src_stride, ref, ref_stride, m, n); \ + } \ + unsigned int aom_highbd_sad##m##x##n##_avg_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, 
int ref_stride, \ + const uint8_t *second_pred) { \ + uint16_t comp_pred[m * n]; \ + aom_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ + return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ + } + +#define highbd_sadMxNxK(m, n, k) \ + void aom_highbd_sad##m##x##n##x##k##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref_array, \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < k; ++i) { \ + sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \ + &ref_array[i], ref_stride); \ + } \ + } + +#define highbd_sadMxNx4D(m, n) \ + void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \ + ref_array[i], ref_stride); \ + } \ + } + +/* clang-format off */ +#if CONFIG_AV1 && CONFIG_EXT_PARTITION +// 128x128 +highbd_sadMxN(128, 128) +highbd_sadMxNxK(128, 128, 3) +highbd_sadMxNxK(128, 128, 8) +highbd_sadMxNx4D(128, 128) + +// 128x64 +highbd_sadMxN(128, 64) +highbd_sadMxNx4D(128, 64) + +// 64x128 +highbd_sadMxN(64, 128) +highbd_sadMxNx4D(64, 128) +#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION + +// 64x64 +highbd_sadMxN(64, 64) +highbd_sadMxNxK(64, 64, 3) +highbd_sadMxNxK(64, 64, 8) +highbd_sadMxNx4D(64, 64) + +// 64x32 +highbd_sadMxN(64, 32) +highbd_sadMxNx4D(64, 32) + +// 32x64 +highbd_sadMxN(32, 64) +highbd_sadMxNx4D(32, 64) + +// 32x32 +highbd_sadMxN(32, 32) +highbd_sadMxNxK(32, 32, 3) +highbd_sadMxNxK(32, 32, 8) +highbd_sadMxNx4D(32, 32) + +// 32x16 +highbd_sadMxN(32, 16) +highbd_sadMxNx4D(32, 16) + +// 16x32 +highbd_sadMxN(16, 32) +highbd_sadMxNx4D(16, 32) + +// 16x16 +highbd_sadMxN(16, 16) +highbd_sadMxNxK(16, 16, 3) +highbd_sadMxNxK(16, 16, 8) +highbd_sadMxNx4D(16, 16) + +// 16x8 +highbd_sadMxN(16, 8) +highbd_sadMxNxK(16, 8, 3) +highbd_sadMxNxK(16, 8, 8) +highbd_sadMxNx4D(16, 8) + +// 8x16 
+highbd_sadMxN(8, 16) +highbd_sadMxNxK(8, 16, 3) +highbd_sadMxNxK(8, 16, 8) +highbd_sadMxNx4D(8, 16) + +// 8x8 +highbd_sadMxN(8, 8) +highbd_sadMxNxK(8, 8, 3) +highbd_sadMxNxK(8, 8, 8) +highbd_sadMxNx4D(8, 8) + +// 8x4 +highbd_sadMxN(8, 4) +highbd_sadMxNxK(8, 4, 8) +highbd_sadMxNx4D(8, 4) + +// 4x8 +highbd_sadMxN(4, 8) +highbd_sadMxNxK(4, 8, 8) +highbd_sadMxNx4D(4, 8) + +// 4x4 +highbd_sadMxN(4, 4) +highbd_sadMxNxK(4, 4, 3) +highbd_sadMxNxK(4, 4, 8) +highbd_sadMxNx4D(4, 4) +/* clang-format on */ +#endif // CONFIG_HIGHBITDEPTH + +#if CONFIG_AV1 && CONFIG_EXT_INTER + static INLINE + unsigned int masked_sad(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, const uint8_t *m, int m_stride, + int width, int height) { + int y, x; + unsigned int sad = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]); + + a += a_stride; + b += b_stride; + m += m_stride; + } + sad = (sad + 31) >> 6; + + return sad; +} + +#define MASKSADMxN(m, n) \ + unsigned int aom_masked_sad##m##x##n##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return masked_sad(src, src_stride, ref, ref_stride, msk, msk_stride, m, \ + n); \ + } + +/* clang-format off */ +#if CONFIG_EXT_PARTITION +MASKSADMxN(128, 128) +MASKSADMxN(128, 64) +MASKSADMxN(64, 128) +#endif // CONFIG_EXT_PARTITION +MASKSADMxN(64, 64) +MASKSADMxN(64, 32) +MASKSADMxN(32, 64) +MASKSADMxN(32, 32) +MASKSADMxN(32, 16) +MASKSADMxN(16, 32) +MASKSADMxN(16, 16) +MASKSADMxN(16, 8) +MASKSADMxN(8, 16) +MASKSADMxN(8, 8) +MASKSADMxN(8, 4) +MASKSADMxN(4, 8) +MASKSADMxN(4, 4) +/* clang-format on */ + +#if CONFIG_HIGHBITDEPTH + static INLINE + unsigned int highbd_masked_sad(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m, int m_stride, int width, + int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = 
CONVERT_TO_SHORTPTR(b8); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]); + + a += a_stride; + b += b_stride; + m += m_stride; + } + sad = (sad + 31) >> 6; + + return sad; +} + +#define HIGHBD_MASKSADMXN(m, n) \ + unsigned int aom_highbd_masked_sad##m##x##n##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return highbd_masked_sad(src, src_stride, ref, ref_stride, msk, \ + msk_stride, m, n); \ + } + +#if CONFIG_EXT_PARTITION +HIGHBD_MASKSADMXN(128, 128) +HIGHBD_MASKSADMXN(128, 64) +HIGHBD_MASKSADMXN(64, 128) +#endif // CONFIG_EXT_PARTITION +HIGHBD_MASKSADMXN(64, 64) +HIGHBD_MASKSADMXN(64, 32) +HIGHBD_MASKSADMXN(32, 64) +HIGHBD_MASKSADMXN(32, 32) +HIGHBD_MASKSADMXN(32, 16) +HIGHBD_MASKSADMXN(16, 32) +HIGHBD_MASKSADMXN(16, 16) +HIGHBD_MASKSADMXN(16, 8) +HIGHBD_MASKSADMXN(8, 16) +HIGHBD_MASKSADMXN(8, 8) +HIGHBD_MASKSADMXN(8, 4) +HIGHBD_MASKSADMXN(4, 8) +HIGHBD_MASKSADMXN(4, 4) +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_AV1 && CONFIG_EXT_INTER + +#if CONFIG_AV1 && CONFIG_MOTION_VAR +// pre: predictor being evaluated +// wsrc: target weighted prediction (has been *4096 to keep precision) +// mask: 2d weights (scaled by 4096) +static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int width, int height) { + int y, x; + unsigned int sad = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); + + pre += pre_stride; + wsrc += width; + mask += width; + } + + return sad; +} + +#define OBMCSADMxN(m, n) \ + unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *mask) { \ + return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ + } + +/* clang-format off */ +#if CONFIG_EXT_PARTITION +OBMCSADMxN(128, 128) +OBMCSADMxN(128, 64) +OBMCSADMxN(64, 128) 
+#endif // CONFIG_EXT_PARTITION +OBMCSADMxN(64, 64) +OBMCSADMxN(64, 32) +OBMCSADMxN(32, 64) +OBMCSADMxN(32, 32) +OBMCSADMxN(32, 16) +OBMCSADMxN(16, 32) +OBMCSADMxN(16, 16) +OBMCSADMxN(16, 8) +OBMCSADMxN(8, 16) +OBMCSADMxN(8, 8) +OBMCSADMxN(8, 4) +OBMCSADMxN(4, 8) +OBMCSADMxN(4, 4) +/* clang-format on */ + +#if CONFIG_HIGHBITDEPTH + static INLINE + unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); + + pre += pre_stride; + wsrc += width; + mask += width; + } + + return sad; +} + +#define HIGHBD_OBMCSADMXN(m, n) \ + unsigned int aom_highbd_obmc_sad##m##x##n##_c( \ + const uint8_t *ref, int ref_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ + } + +/* clang-format off */ +#if CONFIG_EXT_PARTITION +HIGHBD_OBMCSADMXN(128, 128) +HIGHBD_OBMCSADMXN(128, 64) +HIGHBD_OBMCSADMXN(64, 128) +#endif // CONFIG_EXT_PARTITION +HIGHBD_OBMCSADMXN(64, 64) +HIGHBD_OBMCSADMXN(64, 32) +HIGHBD_OBMCSADMXN(32, 64) +HIGHBD_OBMCSADMXN(32, 32) +HIGHBD_OBMCSADMXN(32, 16) +HIGHBD_OBMCSADMXN(16, 32) +HIGHBD_OBMCSADMXN(16, 16) +HIGHBD_OBMCSADMXN(16, 8) +HIGHBD_OBMCSADMXN(8, 16) +HIGHBD_OBMCSADMXN(8, 8) +HIGHBD_OBMCSADMXN(8, 4) +HIGHBD_OBMCSADMXN(4, 8) +HIGHBD_OBMCSADMXN(4, 4) +/* clang-format on */ +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_AV1 && CONFIG_MOTION_VAR diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics.h b/third_party/aom/aom_dsp/simd/v128_intrinsics.h new file mode 100644 index 000000000..8f6509383 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics.h @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V128_INTRINSICS_H +#define _V128_INTRINSICS_H + +#include +#include +#include +#include "./v128_intrinsics_c.h" +#include "./v64_intrinsics.h" + +/* Fallback to plain, unoptimised C. */ + +typedef c_v128 v128; + +SIMD_INLINE uint32_t v128_low_u32(v128 a) { return c_v128_low_u32(a); } +SIMD_INLINE v64 v128_low_v64(v128 a) { return c_v128_low_v64(a); } +SIMD_INLINE v64 v128_high_v64(v128 a) { return c_v128_high_v64(a); } +SIMD_INLINE v128 v128_from_64(uint64_t hi, uint64_t lo) { + return c_v128_from_64(hi, lo); +} +SIMD_INLINE v128 v128_from_v64(v64 hi, v64 lo) { + return c_v128_from_v64(hi, lo); +} +SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return c_v128_from_32(a, b, c, d); +} + +SIMD_INLINE v128 v128_load_unaligned(const void *p) { + return c_v128_load_unaligned(p); +} +SIMD_INLINE v128 v128_load_aligned(const void *p) { + return c_v128_load_aligned(p); +} + +SIMD_INLINE void v128_store_unaligned(void *p, v128 a) { + c_v128_store_unaligned(p, a); +} +SIMD_INLINE void v128_store_aligned(void *p, v128 a) { + c_v128_store_aligned(p, a); +} + +SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) { + return c_v128_align(a, b, c); +} + +SIMD_INLINE v128 v128_zero() { return c_v128_zero(); } +SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); } +SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); } +SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); } + +typedef uint32_t sad128_internal; 
+SIMD_INLINE sad128_internal v128_sad_u8_init() { return c_v128_sad_u8_init(); } +SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) { + return c_v128_sad_u8(s, a, b); +} +SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) { + return c_v128_sad_u8_sum(s); +} +typedef uint32_t ssd128_internal; +SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return c_v128_ssd_u8_init(); } +SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) { + return c_v128_ssd_u8(s, a, b); +} +SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) { + return c_v128_ssd_u8_sum(s); +} +SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { + return c_v128_dotp_s16(a, b); +} +SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); } + +SIMD_INLINE v128 v128_or(v128 a, v128 b) { return c_v128_or(a, b); } +SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return c_v128_xor(a, b); } +SIMD_INLINE v128 v128_and(v128 a, v128 b) { return c_v128_and(a, b); } +SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return c_v128_andn(a, b); } + +SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return c_v128_add_8(a, b); } +SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return c_v128_add_16(a, b); } +SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return c_v128_sadd_s16(a, b); } +SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return c_v128_add_32(a, b); } +SIMD_INLINE v128 v128_padd_s16(v128 a) { return c_v128_padd_s16(a); } +SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return c_v128_sub_8(a, b); } +SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return c_v128_ssub_u8(a, b); } +SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return c_v128_ssub_s8(a, b); } +SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return c_v128_sub_16(a, b); } +SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); } +SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); } +SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, 
b); } +SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); } +SIMD_INLINE v128 v128_abs_s8(v128 a) { return c_v128_abs_s8(a); } + +SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { return c_v128_mul_s16(a, b); } +SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { + return c_v128_mullo_s16(a, b); +} +SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { + return c_v128_mulhi_s16(a, b); +} +SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { + return c_v128_mullo_s32(a, b); +} +SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return c_v128_madd_s16(a, b); } +SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { return c_v128_madd_us8(a, b); } + +SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return c_v128_avg_u8(a, b); } +SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { return c_v128_rdavg_u8(a, b); } +SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return c_v128_avg_u16(a, b); } +SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return c_v128_min_u8(a, b); } +SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return c_v128_max_u8(a, b); } +SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { return c_v128_min_s8(a, b); } +SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { return c_v128_max_s8(a, b); } +SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return c_v128_min_s16(a, b); } +SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return c_v128_max_s16(a, b); } + +SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return c_v128_ziplo_8(a, b); } +SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return c_v128_ziphi_8(a, b); } +SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { return c_v128_ziplo_16(a, b); } +SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { return c_v128_ziphi_16(a, b); } +SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { return c_v128_ziplo_32(a, b); } +SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { return c_v128_ziphi_32(a, b); } +SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { return c_v128_ziplo_64(a, b); } +SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { return 
c_v128_ziphi_64(a, b); } +SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return c_v128_zip_8(a, b); } +SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return c_v128_zip_16(a, b); } +SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return c_v128_zip_32(a, b); } +SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) { + return c_v128_unziplo_8(a, b); +} +SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) { + return c_v128_unziphi_8(a, b); +} +SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) { + return c_v128_unziplo_16(a, b); +} +SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) { + return c_v128_unziphi_16(a, b); +} +SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) { + return c_v128_unziplo_32(a, b); +} +SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) { + return c_v128_unziphi_32(a, b); +} +SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { return c_v128_unpack_u8_s16(a); } +SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { + return c_v128_unpacklo_u8_s16(a); +} +SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { + return c_v128_unpackhi_u8_s16(a); +} +SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { return c_v128_unpack_s8_s16(a); } +SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { + return c_v128_unpacklo_s8_s16(a); +} +SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { + return c_v128_unpackhi_s8_s16(a); +} +SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { + return c_v128_pack_s32_s16(a, b); +} +SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { + return c_v128_pack_s16_u8(a, b); +} +SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { + return c_v128_pack_s16_s8(a, b); +} +SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { return c_v128_unpack_u16_s32(a); } +SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { return c_v128_unpack_s16_s32(a); } +SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { + return c_v128_unpacklo_u16_s32(a); +} +SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { + return c_v128_unpacklo_s16_s32(a); +} +SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { + return c_v128_unpackhi_u16_s32(a); 
+} +SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { + return c_v128_unpackhi_s16_s32(a); +} +SIMD_INLINE v128 v128_shuffle_8(v128 a, v128 pattern) { + return c_v128_shuffle_8(a, pattern); +} + +SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return c_v128_cmpgt_s8(a, b); } +SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return c_v128_cmplt_s8(a, b); } +SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return c_v128_cmpeq_8(a, b); } +SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) { + return c_v128_cmpgt_s16(a, b); +} +SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) { + return c_v128_cmplt_s16(a, b); +} +SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return c_v128_cmpeq_16(a, b); } + +SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { + return c_v128_shl_8(a, c); +} +SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { + return c_v128_shr_u8(a, c); +} +SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) { + return c_v128_shr_s8(a, c); +} +SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { + return c_v128_shl_16(a, c); +} +SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { + return c_v128_shr_u16(a, c); +} +SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { + return c_v128_shr_s16(a, c); +} +SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { + return c_v128_shl_32(a, c); +} +SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { + return c_v128_shr_u32(a, c); +} +SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { + return c_v128_shr_s32(a, c); +} + +SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { + return c_v128_shr_n_byte(a, n); +} +SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { + return c_v128_shl_n_byte(a, n); +} +SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int n) { + return c_v128_shl_n_8(a, n); +} +SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int n) { + return c_v128_shl_n_16(a, n); +} +SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int n) { + return c_v128_shl_n_32(a, n); +} +SIMD_INLINE v128 
v128_shr_n_u8(v128 a, unsigned int n) { + return c_v128_shr_n_u8(a, n); +} +SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int n) { + return c_v128_shr_n_u16(a, n); +} +SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int n) { + return c_v128_shr_n_u32(a, n); +} +SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int n) { + return c_v128_shr_n_s8(a, n); +} +SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int n) { + return c_v128_shr_n_s16(a, n); +} +SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int n) { + return c_v128_shr_n_s32(a, n); +} + +#endif /* _V128_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h new file mode 100644 index 000000000..0377d4ce1 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h @@ -0,0 +1,671 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef _V128_INTRINSICS_H +#define _V128_INTRINSICS_H + +#include +#include "./v64_intrinsics_arm.h" + +typedef int64x2_t v128; + +SIMD_INLINE uint32_t v128_low_u32(v128 a) { + return v64_low_u32(vget_low_s64(a)); +} + +SIMD_INLINE v64 v128_low_v64(v128 a) { return vget_low_s64(a); } + +SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); } + +SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); } + +SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) { + return vcombine_s64((uint64x1_t)b, (uint64x1_t)a); +} + +SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return vcombine_s64(v64_from_32(c, d), v64_from_32(a, b)); +} + +SIMD_INLINE v128 v128_load_aligned(const void *p) { + return vreinterpretq_s64_u8(vld1q_u8((const uint8_t *)p)); +} + +SIMD_INLINE v128 v128_load_unaligned(const void *p) { + return v128_load_aligned(p); +} + +SIMD_INLINE void v128_store_aligned(void *p, v128 r) { + vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r)); +} + +SIMD_INLINE void v128_store_unaligned(void *p, v128 r) { + vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r)); +} + +SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) { +// The following functions require an immediate. +// Some compilers will check this during optimisation, others wont. +#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) + return c ? vreinterpretq_s64_s8( + vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c)) + : b; +#else + return c < 8 ? 
v128_from_v64(v64_align(v128_low_v64(a), v128_high_v64(b), c), + v64_align(v128_high_v64(b), v128_low_v64(b), c)) + : v128_from_v64( + v64_align(v128_high_v64(a), v128_low_v64(a), c - 8), + v64_align(v128_low_v64(a), v128_high_v64(b), c - 8)); +#endif +} + +SIMD_INLINE v128 v128_zero() { return vreinterpretq_s64_u8(vdupq_n_u8(0)); } + +SIMD_INLINE v128 v128_ones() { return vreinterpretq_s64_u8(vdupq_n_u8(-1)); } + +SIMD_INLINE v128 v128_dup_8(uint8_t x) { + return vreinterpretq_s64_u8(vdupq_n_u8(x)); +} + +SIMD_INLINE v128 v128_dup_16(uint16_t x) { + return vreinterpretq_s64_u16(vdupq_n_u16(x)); +} + +SIMD_INLINE v128 v128_dup_32(uint32_t x) { + return vreinterpretq_s64_u32(vdupq_n_u32(x)); +} + +SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { + return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) + + v64_dotp_s16(vget_low_s64(a), vget_low_s64(b)); +} + +SIMD_INLINE uint64_t v128_hadd_u8(v128 x) { + uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x)))); + return vget_lane_s32( + vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0); +} + +SIMD_INLINE v128 v128_padd_s16(v128 a) { + return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a))); +} + +typedef struct { sad64_internal hi, lo; } sad128_internal; + +SIMD_INLINE sad128_internal v128_sad_u8_init() { + sad128_internal s; + s.hi = s.lo = vdupq_n_u16(0); + return s; +} + +/* Implementation dependent return value. Result must be finalised with + v128_sad_u8_sum(). + The result for more than 32 v128_sad_u8() calls is undefined. 
*/ +SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) { + sad128_internal r; + r.hi = v64_sad_u8(s.hi, vget_high_s64(a), vget_high_s64(b)); + r.lo = v64_sad_u8(s.lo, vget_low_s64(a), vget_low_s64(b)); + return r; +} + +SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) { + return (uint32_t)(v64_sad_u8_sum(s.hi) + v64_sad_u8_sum(s.lo)); +} + +typedef struct { ssd64_internal hi, lo; } ssd128_internal; + +SIMD_INLINE ssd128_internal v128_ssd_u8_init() { + ssd128_internal s; + s.hi = s.lo = (ssd64_internal)(uint64_t)0; + return s; +} + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_u8_sum(). */ +SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) { + ssd128_internal r; + r.hi = v64_ssd_u8(s.hi, vget_high_s64(a), vget_high_s64(b)); + r.lo = v64_ssd_u8(s.lo, vget_low_s64(a), vget_low_s64(b)); + return r; +} + +SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) { + return (uint32_t)(v64_ssd_u8_sum(s.hi) + v64_ssd_u8_sum(s.lo)); +} + +SIMD_INLINE v128 v128_or(v128 x, v128 y) { return vorrq_s64(x, y); } + +SIMD_INLINE v128 v128_xor(v128 x, v128 y) { return veorq_s64(x, y); } + +SIMD_INLINE v128 v128_and(v128 x, v128 y) { return vandq_s64(x, y); } + +SIMD_INLINE v128 v128_andn(v128 x, v128 y) { return vbicq_s64(x, y); } + +SIMD_INLINE v128 v128_add_8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_add_16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_sadd_s16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vqaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_add_32(v128 x, v128 y) { + return vreinterpretq_s64_u32( + vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y))); +} + +SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) { + return 
vreinterpretq_s64_u8( + vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_sub_16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_ssub_s16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vqsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_ssub_u16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vqsubq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); +} + +SIMD_INLINE v128 v128_ssub_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_ssub_s8(v128 x, v128 y) { + return vreinterpretq_s64_s8( + vqsubq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_sub_32(v128 x, v128 y) { + return vreinterpretq_s64_s32( + vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); +} + +SIMD_INLINE v128 v128_abs_s16(v128 x) { + return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x))); +} + +SIMD_INLINE v128 v128_abs_s8(v128 x) { + return vreinterpretq_s64_s8(vabsq_s8(vreinterpretq_s8_s64(x))); +} + +SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { + return vreinterpretq_s64_s32( + vmull_s16(vreinterpret_s16_s64(a), vreinterpret_s16_s64(b))); +} + +SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { + return vreinterpretq_s64_s16( + vmulq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b))); +} + +SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { + return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)), + v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b))); +} + +SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { + return vreinterpretq_s64_s32( + vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); +} + +SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { + return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)), + 
v64_madd_s16(vget_low_s64(a), vget_low_s64(b))); +} + +SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { + return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)), + v64_madd_us8(vget_low_s64(a), vget_low_s64(b))); +} + +SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vrhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_rdavg_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); +} + +SIMD_INLINE v128 v128_min_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vminq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_max_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vmaxq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_min_s8(v128 x, v128 y) { + return vreinterpretq_s64_s8( + vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) { + return vreinterpretq_s64_s8( + vmaxq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_min_s16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vminq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_max_s16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) { + uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); + return vreinterpretq_s64_u8(r.val[0]); +} + +SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) { + uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); + return vreinterpretq_s64_u8(r.val[1]); +} + +SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) { + uint8x8x2_t r = 
vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpretq_s64_u8(vcombine_u8(r.val[0], r.val[1])); +} + +SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) { + int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x)); + return vreinterpretq_s64_s16(r.val[0]); +} + +SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) { + int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x)); + return vreinterpretq_s64_s16(r.val[1]); +} + +SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) { + uint16x4x2_t r = vzip_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); + return vreinterpretq_s64_u16(vcombine_u16(r.val[0], r.val[1])); +} + +SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) { + int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x)); + return vreinterpretq_s64_s32(r.val[0]); +} + +SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) { + int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x)); + return vreinterpretq_s64_s32(r.val[1]); +} + +SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) { + uint32x2x2_t r = vzip_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)); + return vreinterpretq_s64_u32(vcombine_u32(r.val[0], r.val[1])); +} + +SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { + return v128_from_v64(vget_low_u64((uint64x2_t)a), + vget_low_u64((uint64x2_t)b)); +} + +SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { + return v128_from_v64(vget_high_u64((uint64x2_t)a), + vget_high_u64((uint64x2_t)b)); +} + +SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) { + uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); + return vreinterpretq_s64_u8(r.val[0]); +} + +SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) { + uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); + return vreinterpretq_s64_u8(r.val[1]); +} + +SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) { + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)); + 
return vreinterpretq_s64_u16(r.val[0]); +} + +SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) { + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)); + return vreinterpretq_s64_u16(r.val[1]); +} + +SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) { + uint32x4x2_t r = + vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)); + return vreinterpretq_s64_u32(r.val[0]); +} + +SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) { + uint32x4x2_t r = + vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)); + return vreinterpretq_s64_u32(r.val[1]); +} + +SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { + return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(a))); +} + +SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { + return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))); +} + +SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { + return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))); +} + +SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { + return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(a))); +} + +SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { + return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a)))); +} + +SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { + return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a)))); +} + +SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { + return v128_from_v64( + vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(a))), + vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b)))); +} + +SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { + return v128_from_v64( + vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))), + vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(b)))); +} + +SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { + return v128_from_v64( + vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(a))), + vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(b)))); +} + +SIMD_INLINE 
v128 v128_unpack_u16_s32(v64 a) { + return vreinterpretq_s64_u32(vmovl_u16(vreinterpret_u16_s64(a))); +} + +SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { + return vreinterpretq_s64_s32(vmovl_s16(vreinterpret_s16_s64(a))); +} + +SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { + return vreinterpretq_s64_u32( + vmovl_u16(vreinterpret_u16_s64(vget_low_s64(a)))); +} + +SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { + return vreinterpretq_s64_s32( + vmovl_s16(vreinterpret_s16_s64(vget_low_s64(a)))); +} + +SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { + return vreinterpretq_s64_u32( + vmovl_u16(vreinterpret_u16_s64(vget_high_s64(a)))); +} + +SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { + return vreinterpretq_s64_s32( + vmovl_s16(vreinterpret_s16_s64(vget_high_s64(a)))); +} + +SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) { + return v128_from_64( + (uint64_t)vreinterpret_s64_u8( + vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)), + vget_high_u8(vreinterpretq_u8_s64(x)) } }, + vreinterpret_u8_s64(vget_high_s64(pattern)))), + (uint64_t)vreinterpret_s64_u8( + vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)), + vget_high_u8(vreinterpretq_u8_s64(x)) } }, + vreinterpret_u8_s64(vget_low_s64(pattern))))); +} + +SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vcgtq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_cmplt_s8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vcltq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_cmpeq_8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vceqq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_cmpgt_s16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vcgtq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_cmplt_s16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vcltq_s16(vreinterpretq_s16_s64(x), 
vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_cmpeq_16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { + return (c > 7) ? v128_zero() : vreinterpretq_s64_u8(vshlq_u8( + vreinterpretq_u8_s64(a), vdupq_n_s8(c))); +} + +SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { + return (c > 7) ? v128_zero() : vreinterpretq_s64_u8(vshlq_u8( + vreinterpretq_u8_s64(a), vdupq_n_s8(-c))); +} + +SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) { + return (c > 7) ? v128_ones() : vreinterpretq_s64_s8(vshlq_s8( + vreinterpretq_s8_s64(a), vdupq_n_s8(-c))); +} + +SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { + return (c > 15) ? v128_zero() + : vreinterpretq_s64_u16( + vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(c))); +} + +SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { + return (c > 15) ? v128_zero() + : vreinterpretq_s64_u16( + vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(-c))); +} + +SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { + return (c > 15) ? v128_ones() + : vreinterpretq_s64_s16( + vshlq_s16(vreinterpretq_s16_s64(a), vdupq_n_s16(-c))); +} + +SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { + return (c > 31) ? v128_zero() + : vreinterpretq_s64_u32( + vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(c))); +} + +SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { + return (c > 31) ? v128_zero() + : vreinterpretq_s64_u32( + vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(-c))); +} + +SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { + return (c > 31) ? v128_ones() + : vreinterpretq_s64_s32( + vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c))); +} + +#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) + +SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { + return n < 8 + ? 
v128_from_64( + (uint64_t)vorr_u64( + vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), + n * 8), + vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), + (8 - n) * 8)), + (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), + n * 8)) + : (n == 8 ? v128_from_64( + (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0) + : v128_from_64((uint64_t)vshl_n_u64( + vreinterpret_u64_s64(vget_low_s64(a)), + (n - 8) * 8), + 0)); +} + +SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { + return n < 8 + ? v128_from_64( + vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), n * 8), + vorr_u64( + vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8), + vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), + (8 - n) * 8))) + : (n == 8 + ? v128_from_64(0, vreinterpret_u64_s64(vget_high_s64(a))) + : v128_from_64( + 0, vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), + (n - 8) * 8))); +} + +SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) { + return vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c)); +} + +SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) { + return vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c)); +} + +SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) { + return vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c)); +} + +SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) { + return vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c)); +} + +SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) { + return vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c)); +} + +SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) { + return vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c)); +} + +SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) { + return vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c)); +} + +SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) { + return vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c)); +} + +SIMD_INLINE 
v128 v128_shr_n_s32(v128 a, unsigned int c) { + return vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c)); +} + +#else + +SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { + if (n < 8) + return v128_from_v64(v64_or(v64_shl_n_byte(v128_high_v64(a), n), + v64_shr_n_byte(v128_low_v64(a), 8 - n)), + v64_shl_n_byte(v128_low_v64(a), n)); + else + return v128_from_v64(v64_shl_n_byte(v128_low_v64(a), n - 8), v64_zero()); +} + +SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { + if (n < 8) + return v128_from_v64(v64_shr_n_byte(v128_high_v64(a), n), + v64_or(v64_shr_n_byte(v128_low_v64(a), n), + v64_shl_n_byte(v128_high_v64(a), 8 - n))); + else + return v128_from_v64(v64_zero(), v64_shr_n_byte(v128_high_v64(a), n - 8)); +} + +SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) { + return v128_shl_8(a, c); +} + +SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) { + return v128_shr_u8(a, c); +} + +SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) { + return v128_shr_s8(a, c); +} + +SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) { + return v128_shl_16(a, c); +} + +SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) { + return v128_shr_u16(a, c); +} + +SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) { + return v128_shr_s16(a, c); +} + +SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) { + return v128_shl_32(a, c); +} + +SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) { + return v128_shr_u32(a, c); +} + +SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) { + return v128_shr_s32(a, c); +} + +#endif + +#endif /* _V128_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h new file mode 100644 index 000000000..32e7c32de --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h @@ -0,0 +1,707 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V128_INTRINSICS_C_H +#define _V128_INTRINSICS_C_H + +#include +#include +#include "./v64_intrinsics_c.h" +#include "./aom_config.h" + +typedef union { + uint8_t u8[16]; + uint16_t u16[8]; + uint32_t u32[4]; + uint64_t u64[2]; + int8_t s8[16]; + int16_t s16[8]; + int32_t s32[4]; + int64_t s64[2]; + c_v64 v64[2]; +} c_v128; + +SIMD_INLINE uint32_t c_v128_low_u32(c_v128 a) { return a.u32[0]; } + +SIMD_INLINE c_v64 c_v128_low_v64(c_v128 a) { return a.v64[0]; } + +SIMD_INLINE c_v64 c_v128_high_v64(c_v128 a) { return a.v64[1]; } + +SIMD_INLINE c_v128 c_v128_from_64(uint64_t hi, uint64_t lo) { + c_v128 t; + t.u64[1] = hi; + t.u64[0] = lo; + return t; +} + +SIMD_INLINE c_v128 c_v128_from_v64(c_v64 hi, c_v64 lo) { + c_v128 t; + t.v64[1] = hi; + t.v64[0] = lo; + return t; +} + +SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c, + uint32_t d) { + c_v128 t; + t.u32[3] = a; + t.u32[2] = b; + t.u32[1] = c; + t.u32[0] = d; + return t; +} + +SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) { + c_v128 t; + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&t; + int c; + for (c = 0; c < 16; c++) q[c] = pp[c]; + return t; +} + +SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) { + if (SIMD_CHECK && (uintptr_t)p & 15) { + fprintf(stderr, "Error: unaligned v128 load at %p\n", p); + abort(); + } + return c_v128_load_unaligned(p); +} + +SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) { + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&a; + int c; + 
for (c = 0; c < 16; c++) pp[c] = q[c]; +} + +SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) { + if (SIMD_CHECK && (uintptr_t)p & 15) { + fprintf(stderr, "Error: unaligned v128 store at %p\n", p); + abort(); + } + c_v128_store_unaligned(p, a); +} + +SIMD_INLINE c_v128 c_v128_zero() { + c_v128 t; + t.u64[1] = t.u64[0] = 0; + return t; +} + +SIMD_INLINE c_v128 c_v128_dup_8(uint8_t x) { + c_v128 t; + t.v64[1] = t.v64[0] = c_v64_dup_8(x); + return t; +} + +SIMD_INLINE c_v128 c_v128_dup_16(uint16_t x) { + c_v128 t; + t.v64[1] = t.v64[0] = c_v64_dup_16(x); + return t; +} + +SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) { + c_v128 t; + t.v64[1] = t.v64[0] = c_v64_dup_32(x); + return t; +} + +SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) { + return c_v64_dotp_s16(a.v64[1], b.v64[1]) + + c_v64_dotp_s16(a.v64[0], b.v64[0]); +} + +SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) { + return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]); +} + +typedef uint32_t c_sad128_internal; + +SIMD_INLINE c_sad128_internal c_v128_sad_u8_init() { return 0; } + +/* Implementation dependent return value. Result must be finalised with + v128_sad_u8_sum(). + The result for more than 32 v128_sad_u8() calls is undefined. */ +SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a, + c_v128 b) { + int c; + for (c = 0; c < 16; c++) + s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; + return s; +} + +SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s; } + +typedef uint32_t c_ssd128_internal; + +SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init() { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_u8_sum(). 
*/ +SIMD_INLINE c_ssd128_internal c_v128_ssd_u8(c_ssd128_internal s, c_v128 a, + c_v128 b) { + int c; + for (c = 0; c < 16; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); + return s; +} + +SIMD_INLINE uint32_t c_v128_ssd_u8_sum(c_ssd128_internal s) { return s; } + +SIMD_INLINE c_v128 c_v128_or(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_or(a.v64[1], b.v64[1]), + c_v64_or(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_xor(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_xor(a.v64[1], b.v64[1]), + c_v64_xor(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_and(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_and(a.v64[1], b.v64[1]), + c_v64_and(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_andn(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_andn(a.v64[1], b.v64[1]), + c_v64_andn(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_add_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_add_8(a.v64[1], b.v64[1]), + c_v64_add_8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_add_16(a.v64[1], b.v64[1]), + c_v64_add_16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]), + c_v64_sadd_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_add_32(a.v64[1], b.v64[1]), + c_v64_add_32(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) { + c_v128 t; + t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1]; + t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3]; + t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5]; + t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7]; + return t; +} + +SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]), + c_v64_sub_8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ssub_u8(c_v128 a, c_v128 b) { + return 
c_v128_from_v64(c_v64_ssub_u8(a.v64[1], b.v64[1]), + c_v64_ssub_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ssub_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ssub_s8(a.v64[1], b.v64[1]), + c_v64_ssub_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_sub_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sub_16(a.v64[1], b.v64[1]), + c_v64_sub_16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ssub_s16(a.v64[1], b.v64[1]), + c_v64_ssub_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ssub_u16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ssub_u16(a.v64[1], b.v64[1]), + c_v64_ssub_u16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]), + c_v64_sub_32(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) { + return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_abs_s8(c_v128 a) { + return c_v128_from_v64(c_v64_abs_s8(a.v64[1]), c_v64_abs_s8(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) { + c_v64 lo_bits = c_v64_mullo_s16(a, b); + c_v64 hi_bits = c_v64_mulhi_s16(a, b); + return c_v128_from_v64(c_v64_ziphi_16(hi_bits, lo_bits), + c_v64_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE c_v128 c_v128_mullo_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_mullo_s16(a.v64[1], b.v64[1]), + c_v64_mullo_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_mulhi_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_mulhi_s16(a.v64[1], b.v64[1]), + c_v64_mulhi_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_mullo_s32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_mullo_s32(a.v64[1], b.v64[1]), + c_v64_mullo_s32(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_madd_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_madd_s16(a.v64[1], 
b.v64[1]), + c_v64_madd_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_madd_us8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_madd_us8(a.v64[1], b.v64[1]), + c_v64_madd_us8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_avg_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_avg_u8(a.v64[1], b.v64[1]), + c_v64_avg_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_rdavg_u8(a.v64[1], b.v64[1]), + c_v64_rdavg_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]), + c_v64_avg_u16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_min_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_min_u8(a.v64[1], b.v64[1]), + c_v64_min_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_max_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_max_u8(a.v64[1], b.v64[1]), + c_v64_max_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_min_s8(a.v64[1], b.v64[1]), + c_v64_min_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]), + c_v64_max_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_min_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_min_s16(a.v64[1], b.v64[1]), + c_v64_min_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_max_s16(a.v64[1], b.v64[1]), + c_v64_max_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]), + c_v64_ziplo_8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ziphi_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_8(a.v64[1], b.v64[1]), + c_v64_ziplo_8(a.v64[1], b.v64[1])); +} + +SIMD_INLINE c_v128 
c_v128_ziplo_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_16(a.v64[0], b.v64[0]), + c_v64_ziplo_16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ziphi_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_16(a.v64[1], b.v64[1]), + c_v64_ziplo_16(a.v64[1], b.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_ziplo_32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_32(a.v64[0], b.v64[0]), + c_v64_ziplo_32(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ziphi_32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_32(a.v64[1], b.v64[1]), + c_v64_ziplo_32(a.v64[1], b.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_ziplo_64(c_v128 a, c_v128 b) { + return c_v128_from_v64(a.v64[0], b.v64[0]); +} + +SIMD_INLINE c_v128 c_v128_ziphi_64(c_v128 a, c_v128 b) { + return c_v128_from_v64(a.v64[1], b.v64[1]); +} + +SIMD_INLINE c_v128 c_v128_zip_8(c_v64 a, c_v64 b) { + return c_v128_from_v64(c_v64_ziphi_8(a, b), c_v64_ziplo_8(a, b)); +} + +SIMD_INLINE c_v128 c_v128_zip_16(c_v64 a, c_v64 b) { + return c_v128_from_v64(c_v64_ziphi_16(a, b), c_v64_ziplo_16(a, b)); +} + +SIMD_INLINE c_v128 c_v128_zip_32(c_v64 a, c_v64 b) { + return c_v128_from_v64(c_v64_ziphi_32(a, b), c_v64_ziplo_32(a, b)); +} + +SIMD_INLINE c_v128 _c_v128_unzip_8(c_v128 a, c_v128 b, int mode) { + c_v128 t; + if (mode) { + t.u8[15] = b.u8[15]; + t.u8[14] = b.u8[13]; + t.u8[13] = b.u8[11]; + t.u8[12] = b.u8[9]; + t.u8[11] = b.u8[7]; + t.u8[10] = b.u8[5]; + t.u8[9] = b.u8[3]; + t.u8[8] = b.u8[1]; + t.u8[7] = a.u8[15]; + t.u8[6] = a.u8[13]; + t.u8[5] = a.u8[11]; + t.u8[4] = a.u8[9]; + t.u8[3] = a.u8[7]; + t.u8[2] = a.u8[5]; + t.u8[1] = a.u8[3]; + t.u8[0] = a.u8[1]; + } else { + t.u8[15] = a.u8[14]; + t.u8[14] = a.u8[12]; + t.u8[13] = a.u8[10]; + t.u8[12] = a.u8[8]; + t.u8[11] = a.u8[6]; + t.u8[10] = a.u8[4]; + t.u8[9] = a.u8[2]; + t.u8[8] = a.u8[0]; + t.u8[7] = b.u8[14]; + t.u8[6] = b.u8[12]; + t.u8[5] = b.u8[10]; + t.u8[4] = b.u8[8]; + t.u8[3] = b.u8[6]; + t.u8[2] = b.u8[4]; + 
t.u8[1] = b.u8[2]; + t.u8[0] = b.u8[0]; + } + return t; +} + +SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1) + : _c_v128_unzip_8(a, b, 0); +} + +SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(b, a, 0) + : _c_v128_unzip_8(b, a, 1); +} + +SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) { + c_v128 t; + if (mode) { + t.u16[7] = b.u16[7]; + t.u16[6] = b.u16[5]; + t.u16[5] = b.u16[3]; + t.u16[4] = b.u16[1]; + t.u16[3] = a.u16[7]; + t.u16[2] = a.u16[5]; + t.u16[1] = a.u16[3]; + t.u16[0] = a.u16[1]; + } else { + t.u16[7] = a.u16[6]; + t.u16[6] = a.u16[4]; + t.u16[5] = a.u16[2]; + t.u16[4] = a.u16[0]; + t.u16[3] = b.u16[6]; + t.u16[2] = b.u16[4]; + t.u16[1] = b.u16[2]; + t.u16[0] = b.u16[0]; + } + return t; +} + +SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1) + : _c_v128_unzip_16(a, b, 0); +} + +SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0) + : _c_v128_unzip_16(b, a, 1); +} + +SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) { + c_v128 t; + if (mode) { + t.u32[3] = b.u32[3]; + t.u32[2] = b.u32[1]; + t.u32[1] = a.u32[3]; + t.u32[0] = a.u32[1]; + } else { + t.u32[3] = a.u32[2]; + t.u32[2] = a.u32[0]; + t.u32[1] = b.u32[2]; + t.u32[0] = b.u32[0]; + } + return t; +} + +SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1) + : _c_v128_unzip_32(a, b, 0); +} + +SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? 
_c_v128_unzip_32(b, a, 0) + : _c_v128_unzip_32(b, a, 1); +} + +SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_u8_s16(a), c_v64_unpacklo_u8_s16(a)); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_u8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[0]), + c_v64_unpacklo_u8_s16(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[1]), + c_v64_unpacklo_u8_s16(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_unpack_s8_s16(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_s8_s16(a), c_v64_unpacklo_s8_s16(a)); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_s8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[0]), + c_v64_unpacklo_s8_s16(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_s8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[1]), + c_v64_unpacklo_s8_s16(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]), + c_v64_pack_s32_s16(b.v64[1], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]), + c_v64_pack_s16_u8(b.v64[1], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_pack_s16_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_pack_s16_s8(a.v64[1], a.v64[0]), + c_v64_pack_s16_s8(b.v64[1], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpack_u16_s32(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_u16_s32(a), c_v64_unpacklo_u16_s32(a)); +} + +SIMD_INLINE c_v128 c_v128_unpack_s16_s32(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_s16_s32(a), c_v64_unpacklo_s16_s32(a)); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_u16_s32(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[0]), + c_v64_unpacklo_u16_s32(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_s16_s32(c_v128 a) { + return 
c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[0]), + c_v64_unpacklo_s16_s32(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_u16_s32(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[1]), + c_v64_unpacklo_u16_s32(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[1]), + c_v64_unpacklo_s16_s32(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) { + c_v128 t; + int c; + for (c = 0; c < 16; c++) { + if (pattern.u8[c] & ~15) { + fprintf(stderr, "Undefined v128_shuffle_8 index %d/%d\n", pattern.u8[c], + c); + abort(); + } + t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15) + : pattern.u8[c] & 15]; + } + return t; +} + +SIMD_INLINE c_v128 c_v128_cmpgt_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpgt_s8(a.v64[1], b.v64[1]), + c_v64_cmpgt_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmplt_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmplt_s8(a.v64[1], b.v64[1]), + c_v64_cmplt_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmpeq_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpeq_8(a.v64[1], b.v64[1]), + c_v64_cmpeq_8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmpgt_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpgt_s16(a.v64[1], b.v64[1]), + c_v64_cmpgt_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmplt_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmplt_s16(a.v64[1], b.v64[1]), + c_v64_cmplt_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpeq_16(a.v64[1], b.v64[1]), + c_v64_cmpeq_16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, unsigned int n) { + if (n < 8) + return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n), + c_v64_shr_n_byte(a.v64[0], 8 - n)), + c_v64_shl_n_byte(a.v64[0], n)); + else + return 
c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero()); +} + +SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, unsigned int n) { + if (n < 8) + return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n), + c_v64_or(c_v64_shr_n_byte(a.v64[0], n), + c_v64_shl_n_byte(a.v64[1], 8 - n))); + else + return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8)); +} + +SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, unsigned int c) { + if (SIMD_CHECK && c > 15) { + fprintf(stderr, "Error: undefined alignment %d\n", c); + abort(); + } + return c ? c_v128_or(c_v128_shr_n_byte(b, c), c_v128_shl_n_byte(a, 16 - c)) + : b; +} + +SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, unsigned int c) { + return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, unsigned int c) { + return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, unsigned int c) { + return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, unsigned int c) { + return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, unsigned int c) { + return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c), + c_v64_shr_u16(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, unsigned int c) { + return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c), + c_v64_shr_s16(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, unsigned int c) { + return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, unsigned int c) { + return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c), + c_v64_shr_u32(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, unsigned int c) { + return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c), + c_v64_shr_s32(a.v64[0], c)); +} + +SIMD_INLINE c_v128 
c_v128_shl_n_8(c_v128 a, unsigned int n) { + return c_v128_shl_8(a, n); +} + +SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, unsigned int n) { + return c_v128_shl_16(a, n); +} + +SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, unsigned int n) { + return c_v128_shl_32(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, unsigned int n) { + return c_v128_shr_u8(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, unsigned int n) { + return c_v128_shr_u16(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, unsigned int n) { + return c_v128_shr_u32(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, unsigned int n) { + return c_v128_shr_s8(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, unsigned int n) { + return c_v128_shr_s16(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, unsigned int n) { + return c_v128_shr_s32(a, n); +} + +#endif /* _V128_INTRINSICS_C_H */ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h new file mode 100644 index 000000000..cca1788d5 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h @@ -0,0 +1,511 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef _V128_INTRINSICS_H +#define _V128_INTRINSICS_H + +#include "./v64_intrinsics_x86.h" + +typedef __m128i v128; + +SIMD_INLINE uint32_t v128_low_u32(v128 a) { + return (uint32_t)_mm_cvtsi128_si32(a); +} + +SIMD_INLINE v64 v128_low_v64(v128 a) { + return _mm_unpacklo_epi64(a, v64_zero()); +} + +SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); } + +SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { + return _mm_unpacklo_epi64(b, a); +} + +SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) { + return v128_from_v64(v64_from_64(a), v64_from_64(b)); +} + +SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return _mm_set_epi32(a, b, c, d); +} + +SIMD_INLINE v128 v128_load_aligned(const void *p) { + return _mm_load_si128((__m128i *)p); +} + +SIMD_INLINE v128 v128_load_unaligned(const void *p) { +#if defined(__SSSE3__) + return (__m128i)_mm_lddqu_si128((__m128i *)p); +#else + return _mm_loadu_si128((__m128i *)p); +#endif +} + +SIMD_INLINE void v128_store_aligned(void *p, v128 a) { + _mm_store_si128((__m128i *)p, a); +} + +SIMD_INLINE void v128_store_unaligned(void *p, v128 a) { + _mm_storeu_si128((__m128i *)p, a); +} + +// The following function requires an immediate. +// Some compilers will check this during optimisation, others wont. +#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) +#if defined(__SSSE3__) +SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) { + return c ? _mm_alignr_epi8(a, b, c) : b; +} +#else +#define v128_align(a, b, c) \ + ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b)) +#endif +#else +#if defined(__SSSE3__) +#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, c) : (b)) +#else +#define v128_align(a, b, c) \ + ((c) ? 
_mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b)) +#endif +#endif + +SIMD_INLINE v128 v128_zero() { return _mm_setzero_si128(); } + +SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8(x); } + +SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16(x); } + +SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32(x); } + +SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); } + +SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); } + +SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); } + +SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); } + +SIMD_INLINE v128 v128_padd_s16(v128 a) { + return _mm_madd_epi16(a, _mm_set1_epi16(1)); +} + +SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return _mm_sub_epi8(a, b); } + +SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return _mm_subs_epu8(a, b); } + +SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return _mm_subs_epi8(a, b); } + +SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); } + +SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); } + +SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); } + +SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); } + +SIMD_INLINE v128 v128_abs_s16(v128 a) { +#if defined(__SSSE3__) + return _mm_abs_epi16(a); +#else + return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a)); +#endif +} + +SIMD_INLINE v128 v128_abs_s8(v128 a) { +#if defined(__SSSE3__) + return _mm_abs_epi8(a); +#else + v128 sign = _mm_cmplt_epi8(a, _mm_setzero_si128()); + return _mm_xor_si128(sign, _mm_add_epi8(a, sign)); +#endif +} + +SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { + return _mm_unpacklo_epi8(b, a); +} + +SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { + return _mm_unpackhi_epi8(b, a); +} + +SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { + return 
_mm_unpacklo_epi16(b, a); +} + +SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { + return _mm_unpackhi_epi16(b, a); +} + +SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { + return _mm_unpacklo_epi32(b, a); +} + +SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { + return _mm_unpackhi_epi32(b, a); +} + +SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { + return _mm_unpacklo_epi64(b, a); +} + +SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { + return _mm_unpackhi_epi64(b, a); +} + +SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); } + +SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); } + +SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); } + +SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) { + return _mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8)); +} + +SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) { +#if defined(__SSSE3__) +#ifdef __x86_64__ + v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL); +#else + v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200); +#endif + return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order), + _mm_shuffle_epi8(a, order)); +#else + return v128_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1)); +#endif +} + +SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) { + return _mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)); +} + +SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) { +#if defined(__SSSE3__) +#ifdef __x86_64__ + v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL); +#else + v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100); +#endif + return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order), + _mm_shuffle_epi8(a, order)); +#else + return v128_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2)); +#endif +} + +SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) { + return _mm_castps_si128(_mm_shuffle_ps( + _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1))); +} + +SIMD_INLINE v128 v128_unziplo_32(v128 a, 
v128 b) { + return _mm_castps_si128(_mm_shuffle_ps( + _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(2, 0, 2, 0))); +} + +SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { + return _mm_unpacklo_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { + return _mm_unpacklo_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { + return _mm_unpackhi_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { + return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); +} + +SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { + return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); +} + +SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { + return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8); +} + +SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { + return _mm_packs_epi32(b, a); +} + +SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { + return _mm_packus_epi16(b, a); +} + +SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { + return _mm_packs_epi16(b, a); +} + +SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { + return _mm_unpacklo_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { + return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); +} + +SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { + return _mm_unpacklo_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { + return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); +} + +SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { + return _mm_unpackhi_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { + return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16); +} + +SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(x, pattern); +#else + v128 output; + unsigned char *input = (unsigned char *)&x; + unsigned char *index = (unsigned char *)&pattern; + char *selected = (char *)&output; + int counter; + + for (counter = 0; 
counter < 16; counter++) { + selected[counter] = input[index[counter] & 15]; + } + + return output; +#endif +} + +SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { + v128 r = _mm_madd_epi16(a, b); +#if defined(__SSE4_1__) && defined(__x86_64__) + v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r), + _mm_cvtepi32_epi64(_mm_srli_si128(r, 8))); + return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8))); +#else + return (int64_t)_mm_cvtsi128_si32(r) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12)); +#endif +} + +SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { + v128 t = _mm_sad_epu8(a, _mm_setzero_si128()); + return v64_low_u32(v128_low_v64(t)) + v64_low_u32(v128_high_v64(t)); +} + +typedef v128 sad128_internal; + +SIMD_INLINE sad128_internal v128_sad_u8_init() { return _mm_setzero_si128(); } + +/* Implementation dependent return value. Result must be finalised with + v128_sad_sum(). + The result for more than 32 v128_sad_u8() calls is undefined. */ +SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) { + return _mm_add_epi64(s, _mm_sad_epu8(a, b)); +} + +SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) { + return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s))); +} + +typedef v128 ssd128_internal; + +SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return _mm_setzero_si128(); } + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_sum(). 
*/ +SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) { + v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), + _mm_unpacklo_epi8(b, _mm_setzero_si128())); + v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()), + _mm_unpackhi_epi8(b, _mm_setzero_si128())); + v128 rl = _mm_madd_epi16(l, l); + v128 rh = _mm_madd_epi16(h, h); + v128 c = _mm_cvtsi32_si128(32); + rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 8)); + rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 4)); + rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 8)); + rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 4)); + return _mm_add_epi64( + s, _mm_srl_epi64(_mm_sll_epi64(_mm_unpacklo_epi64(rl, rh), c), c)); +} + +SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) { + return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s))); +} + +SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); } + +SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); } + +SIMD_INLINE v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); } + +SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return _mm_andnot_si128(b, a); } + +SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { + v64 lo_bits = v64_mullo_s16(a, b); + v64 hi_bits = v64_mulhi_s16(a, b); + return v128_from_v64(v64_ziphi_16(hi_bits, lo_bits), + v64_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { + return _mm_mullo_epi16(a, b); +} + +SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { + return _mm_mulhi_epi16(a, b); +} + +SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_mullo_epi32(a, b); +#else + return _mm_unpacklo_epi32( + _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8), + _mm_shuffle_epi32( + _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8)); +#endif +} + +SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); } + +SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { +#if defined(__SSSE3__) + return 
_mm_maddubs_epi16(a, b); +#else + return _mm_packs_epi32( + _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), + _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)), + _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()), + _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8))); +#endif +} + +SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); } + +SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { + return _mm_sub_epi8(_mm_avg_epu8(a, b), + _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1))); +} + +SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); } + +SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); } + +SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return _mm_max_epu8(a, b); } + +SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_min_epi8(a, b); +#else + v128 mask = _mm_cmplt_epi8(a, b); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_max_epi8(a, b); +#else + v128 mask = _mm_cmplt_epi8(b, a); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); } + +SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); } + +SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); } + +SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); } + +SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return _mm_cmpeq_epi8(a, b); } + +SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) { + return _mm_cmpgt_epi16(a, b); +} + +SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) { + return _mm_cmplt_epi16(a, b); +} + +SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); } + +SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << 
c)), + _mm_sll_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8(0xff >> c), + _mm_srl_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) { + __m128i x = _mm_cvtsi32_si128(c + 8); + return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x), + _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x)); +} + +SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { + return _mm_sll_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { + return _mm_srl_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { + return _mm_sra_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { + return _mm_sll_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { + return _mm_srl_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { + return _mm_sra_epi32(a, _mm_cvtsi32_si128(c)); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. 
*/ +#define v128_shl_n_byte(a, c) _mm_slli_si128(a, c) +#define v128_shr_n_byte(a, c) _mm_srli_si128(a, c) +#define v128_shl_n_8(a, c) \ + _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c)) +#define v128_shr_n_u8(a, c) \ + _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c)) +#define v128_shr_n_s8(a, c) \ + _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \ + _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8)) +#define v128_shl_n_16(a, c) _mm_slli_epi16(a, c) +#define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c) +#define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c) +#define v128_shl_n_32(a, c) _mm_slli_epi32(a, c) +#define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c) +#define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c) + +#endif /* _V128_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics.h b/third_party/aom/aom_dsp/simd/v256_intrinsics.h new file mode 100644 index 000000000..1896374ee --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics.h @@ -0,0 +1,283 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V256_INTRINSICS_H +#define _V256_INTRINSICS_H + +#include +#include +#include +#include "./v256_intrinsics_c.h" +#include "./v128_intrinsics.h" +#include "./v64_intrinsics.h" + +/* Fallback to plain, unoptimised C. 
*/ + +typedef c_v256 v256; + +SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); } +SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); } +SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); } +SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); } +SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) { + return c_v256_from_v128(hi, lo); +} +SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { + return c_v256_from_64(a, b, c, d); +} +SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) { + return c_v256_from_v64(a, b, c, d); +} + +SIMD_INLINE v256 v256_load_unaligned(const void *p) { + return c_v256_load_unaligned(p); +} +SIMD_INLINE v256 v256_load_aligned(const void *p) { + return c_v256_load_aligned(p); +} + +SIMD_INLINE void v256_store_unaligned(void *p, v256 a) { + c_v256_store_unaligned(p, a); +} +SIMD_INLINE void v256_store_aligned(void *p, v256 a) { + c_v256_store_aligned(p, a); +} + +SIMD_INLINE v256 v256_align(v256 a, v256 b, unsigned int c) { + return c_v256_align(a, b, c); +} + +SIMD_INLINE v256 v256_zero() { return c_v256_zero(); } +SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); } +SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); } +SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); } + +typedef uint32_t sad256_internal; +SIMD_INLINE sad256_internal v256_sad_u8_init() { return c_v256_sad_u8_init(); } +SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { + return c_v256_sad_u8(s, a, b); +} +SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) { + return c_v256_sad_u8_sum(s); +} +typedef uint32_t ssd256_internal; +SIMD_INLINE ssd256_internal v256_ssd_u8_init() { return c_v256_ssd_u8_init(); } +SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { + return c_v256_ssd_u8(s, a, b); +} +SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { + return 
c_v256_ssd_u8_sum(s); +} +SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { + return c_v256_dotp_s16(a, b); +} +SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); } + +SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); } +SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return c_v256_xor(a, b); } +SIMD_INLINE v256 v256_and(v256 a, v256 b) { return c_v256_and(a, b); } +SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); } + +SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); } +SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); } +SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); } +SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); } +SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); } +SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); } +SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); } +SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return c_v256_ssub_s8(a, b); } +SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return c_v256_sub_16(a, b); } +SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return c_v256_ssub_s16(a, b); } +SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { return c_v256_ssub_u16(a, b); } +SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); } +SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); } +SIMD_INLINE v256 v256_abs_s8(v256 a) { return c_v256_abs_s8(a); } + +SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { return c_v256_mul_s16(a, b); } +SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { + return c_v256_mullo_s16(a, b); +} +SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { + return c_v256_mulhi_s16(a, b); +} +SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { + return c_v256_mullo_s32(a, b); +} +SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); } +SIMD_INLINE v256 
v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); } + +SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); } +SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); } +SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); } +SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); } +SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); } +SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return c_v256_min_s8(a, b); } +SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); } +SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); } +SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); } + +SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); } +SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); } +SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { return c_v256_ziplo_16(a, b); } +SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { return c_v256_ziphi_16(a, b); } +SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { return c_v256_ziplo_32(a, b); } +SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { return c_v256_ziphi_32(a, b); } +SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { return c_v256_ziplo_64(a, b); } +SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { return c_v256_ziphi_64(a, b); } +SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { + return c_v256_ziplo_128(a, b); +} +SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { + return c_v256_ziphi_128(a, b); +} +SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { return c_v256_zip_8(a, b); } +SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { return c_v256_zip_16(a, b); } +SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return c_v256_zip_32(a, b); } +SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { + return c_v256_unziplo_8(a, b); +} +SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { + return 
c_v256_unziphi_8(a, b); +} +SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { + return c_v256_unziplo_16(a, b); +} +SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { + return c_v256_unziphi_16(a, b); +} +SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { + return c_v256_unziplo_32(a, b); +} +SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { + return c_v256_unziphi_32(a, b); +} +SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); } +SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { + return c_v256_unpacklo_u8_s16(a); +} +SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { + return c_v256_unpackhi_u8_s16(a); +} +SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { return c_v256_unpack_s8_s16(a); } +SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { + return c_v256_unpacklo_s8_s16(a); +} +SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { + return c_v256_unpackhi_s8_s16(a); +} +SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { + return c_v256_pack_s32_s16(a, b); +} +SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { + return c_v256_pack_s16_u8(a, b); +} +SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { + return c_v256_pack_s16_s8(a, b); +} +SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { + return c_v256_unpack_u16_s32(a); +} +SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { + return c_v256_unpack_s16_s32(a); +} +SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { + return c_v256_unpacklo_u16_s32(a); +} +SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { + return c_v256_unpacklo_s16_s32(a); +} +SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { + return c_v256_unpackhi_u16_s32(a); +} +SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { + return c_v256_unpackhi_s16_s32(a); +} +SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) { + return c_v256_shuffle_8(a, pattern); +} +SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { + return c_v256_pshuffle_8(a, pattern); +} + +SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return c_v256_cmpgt_s8(a, b); } 
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { return c_v256_cmplt_s8(a, b); } +SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { return c_v256_cmpeq_8(a, b); } +SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { + return c_v256_cmpgt_s16(a, b); +} +SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { + return c_v256_cmplt_s16(a, b); +} +SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); } + +SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) { + return c_v256_shl_8(a, c); +} +SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) { + return c_v256_shr_u8(a, c); +} +SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) { + return c_v256_shr_s8(a, c); +} +SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) { + return c_v256_shl_16(a, c); +} +SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) { + return c_v256_shr_u16(a, c); +} +SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) { + return c_v256_shr_s16(a, c); +} +SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) { + return c_v256_shl_32(a, c); +} +SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) { + return c_v256_shr_u32(a, c); +} +SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { + return c_v256_shr_s32(a, c); +} + +SIMD_INLINE v256 v256_shr_n_byte(v256 a, unsigned int n) { + return c_v256_shr_n_byte(a, n); +} +SIMD_INLINE v256 v256_shl_n_byte(v256 a, unsigned int n) { + return c_v256_shl_n_byte(a, n); +} +SIMD_INLINE v256 v256_shl_n_8(v256 a, unsigned int n) { + return c_v256_shl_n_8(a, n); +} +SIMD_INLINE v256 v256_shl_n_16(v256 a, unsigned int n) { + return c_v256_shl_n_16(a, n); +} +SIMD_INLINE v256 v256_shl_n_32(v256 a, unsigned int n) { + return c_v256_shl_n_32(a, n); +} +SIMD_INLINE v256 v256_shr_n_u8(v256 a, unsigned int n) { + return c_v256_shr_n_u8(a, n); +} +SIMD_INLINE v256 v256_shr_n_u16(v256 a, unsigned int n) { + return c_v256_shr_n_u16(a, n); +} +SIMD_INLINE v256 v256_shr_n_u32(v256 a, unsigned int n) { + return c_v256_shr_n_u32(a, n); +} +SIMD_INLINE 
v256 v256_shr_n_s8(v256 a, unsigned int n) { + return c_v256_shr_n_s8(a, n); +} +SIMD_INLINE v256 v256_shr_n_s16(v256 a, unsigned int n) { + return c_v256_shr_n_s16(a, n); +} +SIMD_INLINE v256 v256_shr_n_s32(v256 a, unsigned int n) { + return c_v256_shr_n_s32(a, n); +} + +#endif /* _V256_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h new file mode 100644 index 000000000..ba4ed719d --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V256_INTRINSICS_H +#define _V256_INTRINSICS_H + +#include "./v256_intrinsics_v128.h" + +#endif /* _V256_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h new file mode 100644 index 000000000..f96ca7fa6 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h @@ -0,0 +1,724 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V256_INTRINSICS_C_H +#define _V256_INTRINSICS_C_H + +#include +#include +#include "./v128_intrinsics_c.h" +#include "./aom_config.h" + +typedef union { + uint8_t u8[32]; + uint16_t u16[16]; + uint32_t u32[8]; + uint64_t u64[4]; + int8_t s8[32]; + int16_t s16[16]; + int32_t s32[8]; + int64_t s64[4]; + c_v64 v64[4]; + c_v128 v128[2]; +} c_v256; + +SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; } + +SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; } + +SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; } + +SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; } + +SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) { + c_v256 t; + t.v128[1] = hi; + t.v128[0] = lo; + return t; +} + +SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c, + uint64_t d) { + c_v256 t; + t.u64[3] = a; + t.u64[2] = b; + t.u64[1] = c; + t.u64[0] = d; + return t; +} + +SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) { + c_v256 t; + t.u64[3] = a.u64; + t.u64[2] = b.u64; + t.u64[1] = c.u64; + t.u64[0] = d.u64; + return t; +} + +SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) { + c_v256 t; + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&t; + int c; + for (c = 0; c < 32; c++) q[c] = pp[c]; + return t; +} + +SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) { + if (SIMD_CHECK && (uintptr_t)p & 31) { + fprintf(stderr, "Error: unaligned v256 load at %p\n", p); + abort(); + } + return c_v256_load_unaligned(p); +} + +SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) { + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&a; + int c; + for (c = 0; c < 32; c++) pp[c] = q[c]; +} + +SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) { + if (SIMD_CHECK && (uintptr_t)p & 31) { + 
fprintf(stderr, "Error: unaligned v256 store at %p\n", p); + abort(); + } + c_v256_store_unaligned(p, a); +} + +SIMD_INLINE c_v256 c_v256_zero() { + c_v256 t; + t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0; + return t; +} + +SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) { + c_v256 t; + t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_8(x); + return t; +} + +SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) { + c_v256 t; + t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_16(x); + return t; +} + +SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) { + c_v256 t; + t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_32(x); + return t; +} + +SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) { + return c_v128_dotp_s16(a.v128[1], b.v128[1]) + + c_v128_dotp_s16(a.v128[0], b.v128[0]); +} + +SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) { + return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]); +} + +typedef uint32_t c_sad256_internal; + +SIMD_INLINE c_sad128_internal c_v256_sad_u8_init() { return 0; } + +/* Implementation dependent return value. Result must be finalised with + v256_sad_u8_sum(). + The result for more than 16 v256_sad_u8() calls is undefined. */ +SIMD_INLINE c_sad128_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a, + c_v256 b) { + int c; + for (c = 0; c < 32; c++) + s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; + return s; +} + +SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s; } + +typedef uint32_t c_ssd256_internal; + +SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init() { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_u8_sum(). 
*/ +SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a, + c_v256 b) { + int c; + for (c = 0; c < 32; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); + return s; +} + +SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; } + +SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]), + c_v128_or(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]), + c_v128_xor(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]), + c_v128_and(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]), + c_v128_andn(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]), + c_v128_add_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]), + c_v128_add_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]), + c_v128_sadd_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]), + c_v128_add_32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) { + c_v256 t; + t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1]; + t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3]; + t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5]; + t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7]; + t.s32[4] = (int32_t)a.s16[8] + (int32_t)a.s16[9]; + t.s32[5] = (int32_t)a.s16[10] + (int32_t)a.s16[11]; + t.s32[6] = (int32_t)a.s16[12] + (int32_t)a.s16[13]; + t.s32[7] = 
(int32_t)a.s16[14] + (int32_t)a.s16[15]; + return t; +} + +SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]), + c_v128_sub_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]), + c_v128_ssub_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]), + c_v128_ssub_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]), + c_v128_sub_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], b.v128[1]), + c_v128_ssub_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_u16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_u16(a.v128[1], b.v128[1]), + c_v128_ssub_u16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]), + c_v128_sub_32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) { + return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_abs_s8(c_v256 a) { + return c_v256_from_v128(c_v128_abs_s8(a.v128[1]), c_v128_abs_s8(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) { + c_v128 lo_bits = c_v128_mullo_s16(a, b); + c_v128 hi_bits = c_v128_mulhi_s16(a, b); + return c_v256_from_v128(c_v128_ziphi_16(hi_bits, lo_bits), + c_v128_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]), + c_v128_mullo_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) { + return 
c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]), + c_v128_mulhi_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]), + c_v128_mullo_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]), + c_v128_madd_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]), + c_v128_madd_us8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]), + c_v128_avg_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]), + c_v128_rdavg_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]), + c_v128_avg_u16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]), + c_v128_min_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]), + c_v128_max_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]), + c_v128_min_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]), + c_v128_max_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]), + c_v128_min_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) { 
+ return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]), + c_v128_max_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]), + c_v128_ziplo_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]), + c_v128_ziplo_8(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]), + c_v128_ziplo_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]), + c_v128_ziplo_16(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]), + c_v128_ziplo_32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]), + c_v128_ziplo_32(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], b.v128[0]), + c_v128_ziplo_64(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]), + c_v128_ziplo_64(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, c_v256 b) { + return c_v256_from_v128(a.v128[0], b.v128[0]); +} + +SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) { + return c_v256_from_v128(a.v128[1], b.v128[1]); +} + +SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) { + return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b)); +} + +SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) { + return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b)); +} + +SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, 
c_v128 b) { + return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b)); +} + +SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) { + c_v256 t; + int i; + if (mode) { + for (i = 0; i < 16; i++) { + t.u8[i] = a.u8[i * 2 + 1]; + t.u8[i + 16] = b.u8[i * 2 + 1]; + } + } else { + for (i = 0; i < 16; i++) { + t.u8[i] = b.u8[i * 2]; + t.u8[i + 16] = a.u8[i * 2]; + } + } + return t; +} + +SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(a, b, 1) + : _c_v256_unzip_8(a, b, 0); +} + +SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(b, a, 0) + : _c_v256_unzip_8(b, a, 1); +} + +SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) { + c_v256 t; + int i; + if (mode) { + for (i = 0; i < 8; i++) { + t.u16[i] = a.u16[i * 2 + 1]; + t.u16[i + 8] = b.u16[i * 2 + 1]; + } + } else { + for (i = 0; i < 8; i++) { + t.u16[i] = b.u16[i * 2]; + t.u16[i + 8] = a.u16[i * 2]; + } + } + return t; +} + +SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(a, b, 1) + : _c_v256_unzip_16(a, b, 0); +} + +SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(b, a, 0) + : _c_v256_unzip_16(b, a, 1); +} + +SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) { + c_v256 t; + if (mode) { + t.u32[7] = b.u32[7]; + t.u32[6] = b.u32[5]; + t.u32[5] = b.u32[3]; + t.u32[4] = b.u32[1]; + t.u32[3] = a.u32[7]; + t.u32[2] = a.u32[5]; + t.u32[1] = a.u32[3]; + t.u32[0] = a.u32[1]; + } else { + t.u32[7] = a.u32[6]; + t.u32[6] = a.u32[4]; + t.u32[5] = a.u32[2]; + t.u32[4] = a.u32[0]; + t.u32[3] = b.u32[6]; + t.u32[2] = b.u32[4]; + t.u32[1] = b.u32[2]; + t.u32[0] = b.u32[0]; + } + return t; +} + +SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? 
_c_v256_unzip_32(a, b, 1) + : _c_v256_unzip_32(a, b, 0); +} + +SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(b, a, 0) + : _c_v256_unzip_32(b, a, 1); +} + +SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) { + return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a)); +} + +SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]), + c_v128_unpacklo_u8_s16(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]), + c_v128_unpacklo_u8_s16(a.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_unpack_s8_s16(c_v128 a) { + return c_v256_from_v128(c_v128_unpackhi_s8_s16(a), c_v128_unpacklo_s8_s16(a)); +} + +SIMD_INLINE c_v256 c_v256_unpacklo_s8_s16(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[0]), + c_v128_unpacklo_s8_s16(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_unpackhi_s8_s16(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[1]), + c_v128_unpacklo_s8_s16(a.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]), + c_v128_pack_s32_s16(b.v128[1], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]), + c_v128_pack_s16_u8(b.v128[1], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]), + c_v128_pack_s16_s8(b.v128[1], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) { + return c_v256_from_v128(c_v128_unpackhi_u16_s32(a), + c_v128_unpacklo_u16_s32(a)); +} + +SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) { + return c_v256_from_v128(c_v128_unpackhi_s16_s32(a), + c_v128_unpacklo_s16_s32(a)); +} + +SIMD_INLINE c_v256 
c_v256_unpacklo_u16_s32(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]), + c_v128_unpacklo_u16_s32(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]), + c_v128_unpacklo_s16_s32(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]), + c_v128_unpacklo_u16_s32(a.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]), + c_v128_unpacklo_s16_s32(a.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) { + c_v256 t; + int c; + for (c = 0; c < 32; c++) { + if (pattern.u8[c] & ~31) { + fprintf(stderr, "Undefined v256_shuffle_8 index %d/%d\n", pattern.u8[c], + c); + abort(); + } + t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31) + : pattern.u8[c] & 31]; + } + return t; +} + +// Pairwise / dual-lane shuffle: shuffle two 128 bit lanes. 
+SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) { + return c_v256_from_v128( + c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern)), + c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern))); +} + +SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]), + c_v128_cmpgt_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]), + c_v128_cmplt_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]), + c_v128_cmpeq_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]), + c_v128_cmpgt_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]), + c_v128_cmplt_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]), + c_v128_cmpeq_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) { + if (n < 16) + return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n), + c_v128_shr_n_byte(a.v128[0], 16 - n)), + c_v128_shl_n_byte(a.v128[0], n)); + else if (n > 16) + return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16), + c_v128_zero()); + else + return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero()); +} + +SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, unsigned int n) { + if (n < 16) + return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n), + c_v128_or(c_v128_shr_n_byte(a.v128[0], n), + c_v128_shl_n_byte(a.v128[1], 16 - n))); + else if (n > 16) + return c_v256_from_v128(c_v128_zero(), + c_v128_shr_n_byte(a.v128[1], n - 16)); + else + return 
c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a)); +} + +SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, unsigned int c) { + if (SIMD_CHECK && c > 31) { + fprintf(stderr, "Error: undefined alignment %d\n", c); + abort(); + } + return c ? c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c)) + : b; +} + +SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shl_8(a.v128[1], c), + c_v128_shl_8(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c), + c_v128_shr_u8(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c), + c_v128_shr_s8(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shl_16(a.v128[1], c), + c_v128_shl_16(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c), + c_v128_shr_u16(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c), + c_v128_shr_s16(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shl_32(a.v128[1], c), + c_v128_shl_32(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shr_u32(a.v128[1], c), + c_v128_shr_u32(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c), + c_v128_shr_s32(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) { + return c_v256_shl_8(a, n); +} + +SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, unsigned int n) { + return c_v256_shl_16(a, n); +} + +SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, unsigned int n) { + return 
c_v256_shl_32(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) { + return c_v256_shr_u8(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, unsigned int n) { + return c_v256_shr_u16(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, unsigned int n) { + return c_v256_shr_u32(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, unsigned int n) { + return c_v256_shr_s8(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, unsigned int n) { + return c_v256_shr_s16(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, unsigned int n) { + return c_v256_shr_s32(a, n); +} + +#endif /* _V256_INTRINSICS_C_H */ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h new file mode 100644 index 000000000..a4b334ea6 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h @@ -0,0 +1,545 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef _V256_INTRINSICS_V128_H +#define _V256_INTRINSICS_V128_H + +#if HAVE_NEON +#include "./v128_intrinsics_arm.h" +#elif HAVE_SSE2 +#include "./v128_intrinsics_x86.h" +#else +#include "./v128_intrinsics.h" +#endif + +typedef struct { v128 lo, hi; } v256; + +SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.lo); } + +SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.lo); } + +SIMD_INLINE v128 v256_low_v128(v256 a) { return a.lo; } + +SIMD_INLINE v128 v256_high_v128(v256 a) { return a.hi; } + +SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) { + v256 t; + t.hi = hi; + t.lo = lo; + return t; +} + +SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { + return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d)); +} + +SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) { + return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d)); +} + +SIMD_INLINE v256 v256_load_unaligned(const void *p) { + return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16), + v128_load_unaligned(p)); +} + +SIMD_INLINE v256 v256_load_aligned(const void *p) { + return v256_from_v128(v128_load_aligned((uint8_t *)p + 16), + v128_load_aligned(p)); +} + +SIMD_INLINE void v256_store_unaligned(void *p, v256 a) { + v128_store_unaligned(p, a.lo); + v128_store_unaligned((uint8_t *)p + 16, a.hi); +} + +SIMD_INLINE void v256_store_aligned(void *p, v256 a) { + v128_store_aligned(p, a.lo); + v128_store_aligned((uint8_t *)p + 16, a.hi); +} + +SIMD_INLINE v256 v256_zero() { + return v256_from_v128(v128_zero(), v128_zero()); +} + +SIMD_INLINE v256 v256_dup_8(uint8_t x) { + v128 t = v128_dup_8(x); + return v256_from_v128(t, t); +} + +SIMD_INLINE v256 v256_dup_16(uint16_t x) { + v128 t = v128_dup_16(x); + return v256_from_v128(t, t); +} + +SIMD_INLINE v256 v256_dup_32(uint32_t x) { + v128 t = v128_dup_32(x); + return v256_from_v128(t, t); +} + +SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { + return v128_dotp_s16(a.hi, b.hi) 
+ v128_dotp_s16(a.lo, b.lo); +} + +SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { + return v128_hadd_u8(a.hi) + v128_hadd_u8(a.lo); +} + +typedef struct { + sad128_internal hi; + sad128_internal lo; +} sad256_internal; + +SIMD_INLINE sad256_internal v256_sad_u8_init() { + sad256_internal t; + t.hi = v128_sad_u8_init(); + t.lo = v128_sad_u8_init(); + return t; +} + +/* Implementation dependent return value. Result must be finalised with + v256_sad_u8_sum(). + The result for more than 16 v256_sad_u8() calls is undefined. */ +SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { + sad256_internal t; + t.hi = v128_sad_u8(s.hi, a.hi, b.hi); + t.lo = v128_sad_u8(s.lo, a.lo, b.lo); + return t; +} + +SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) { + return v128_sad_u8_sum(s.hi) + v128_sad_u8_sum(s.lo); +} + +typedef struct { + ssd128_internal hi; + ssd128_internal lo; +} ssd256_internal; + +SIMD_INLINE ssd256_internal v256_ssd_u8_init() { + ssd256_internal t; + t.hi = v128_ssd_u8_init(); + t.lo = v128_ssd_u8_init(); + return t; +} + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_u8_sum(). 
*/ +SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { + ssd256_internal t; + t.hi = v128_ssd_u8(s.hi, a.hi, b.hi); + t.lo = v128_ssd_u8(s.lo, a.lo, b.lo); + return t; +} + +SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { + return v128_ssd_u8_sum(s.hi) + v128_ssd_u8_sum(s.lo); +} + +SIMD_INLINE v256 v256_or(v256 a, v256 b) { + return v256_from_v128(v128_or(a.hi, b.hi), v128_or(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_xor(v256 a, v256 b) { + return v256_from_v128(v128_xor(a.hi, b.hi), v128_xor(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_and(v256 a, v256 b) { + return v256_from_v128(v128_and(a.hi, b.hi), v128_and(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_andn(v256 a, v256 b) { + return v256_from_v128(v128_andn(a.hi, b.hi), v128_andn(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { + return v256_from_v128(v128_add_8(a.hi, b.hi), v128_add_8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { + return v256_from_v128(v128_add_16(a.hi, b.hi), v128_add_16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { + return v256_from_v128(v128_sadd_s16(a.hi, b.hi), v128_sadd_s16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { + return v256_from_v128(v128_add_32(a.hi, b.hi), v128_add_32(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_padd_s16(v256 a) { + return v256_from_v128(v128_padd_s16(a.hi), v128_padd_s16(a.lo)); +} + +SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { + return v256_from_v128(v128_sub_8(a.hi, b.hi), v128_sub_8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { + return v256_from_v128(v128_ssub_u8(a.hi, b.hi), v128_ssub_u8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { + return v256_from_v128(v128_ssub_s8(a.hi, b.hi), v128_ssub_s8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { + return v256_from_v128(v128_sub_16(a.hi, b.hi), v128_sub_16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { + return 
v256_from_v128(v128_ssub_s16(a.hi, b.hi), v128_ssub_s16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { + return v256_from_v128(v128_ssub_u16(a.hi, b.hi), v128_ssub_u16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { + return v256_from_v128(v128_sub_32(a.hi, b.hi), v128_sub_32(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_abs_s16(v256 a) { + return v256_from_v128(v128_abs_s16(a.hi), v128_abs_s16(a.lo)); +} + +SIMD_INLINE v256 v256_abs_s8(v256 a) { + return v256_from_v128(v128_abs_s8(a.hi), v128_abs_s8(a.lo)); +} + +SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { + v128 lo_bits = v128_mullo_s16(a, b); + v128 hi_bits = v128_mulhi_s16(a, b); + return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits), + v128_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { + return v256_from_v128(v128_mullo_s16(a.hi, b.hi), v128_mullo_s16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { + return v256_from_v128(v128_mulhi_s16(a.hi, b.hi), v128_mulhi_s16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { + return v256_from_v128(v128_mullo_s32(a.hi, b.hi), v128_mullo_s32(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { + return v256_from_v128(v128_madd_s16(a.hi, b.hi), v128_madd_s16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { + return v256_from_v128(v128_madd_us8(a.hi, b.hi), v128_madd_us8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { + return v256_from_v128(v128_avg_u8(a.hi, b.hi), v128_avg_u8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { + return v256_from_v128(v128_rdavg_u8(a.hi, b.hi), v128_rdavg_u8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { + return v256_from_v128(v128_avg_u16(a.hi, b.hi), v128_avg_u16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { + return v256_from_v128(v128_min_u8(a.hi, b.hi), v128_min_u8(a.lo, b.lo)); +} + +SIMD_INLINE v256 
v256_max_u8(v256 a, v256 b) { + return v256_from_v128(v128_max_u8(a.hi, b.hi), v128_max_u8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { + return v256_from_v128(v128_min_s8(a.hi, b.hi), v128_min_s8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { + return v256_from_v128(v128_max_s8(a.hi, b.hi), v128_max_s8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { + return v256_from_v128(v128_min_s16(a.hi, b.hi), v128_min_s16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { + return v256_from_v128(v128_max_s16(a.hi, b.hi), v128_max_s16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_8(a.lo, b.lo), v128_ziplo_8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_8(a.hi, b.hi), v128_ziplo_8(a.hi, b.hi)); +} + +SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_16(a.lo, b.lo), v128_ziplo_16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_16(a.hi, b.hi), v128_ziplo_16(a.hi, b.hi)); +} + +SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_32(a.lo, b.lo), v128_ziplo_32(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_32(a.hi, b.hi), v128_ziplo_32(a.hi, b.hi)); +} + +SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_64(a.lo, b.lo), v128_ziplo_64(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_64(a.hi, b.hi), v128_ziplo_64(a.hi, b.hi)); +} + +SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { + return v256_from_v128(a.lo, b.lo); +} + +SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { + return v256_from_v128(a.hi, b.hi); +} + +SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b)); +} + 
+SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b)); +} + +SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b)); +} + +SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_8(a.hi, a.lo), v128_unziplo_8(b.hi, b.lo)); +} + +SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_8(a.hi, a.lo), v128_unziphi_8(b.hi, b.lo)); +} + +SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_16(a.hi, a.lo), + v128_unziplo_16(b.hi, b.lo)); +} + +SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_16(a.hi, a.lo), + v128_unziphi_16(b.hi, b.lo)); +} + +SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_32(a.hi, a.lo), + v128_unziplo_32(b.hi, b.lo)); +} + +SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_32(a.hi, a.lo), + v128_unziphi_32(b.hi, b.lo)); +} + +SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { + return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a)); +} + +SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_u8_s16(a.lo), v128_unpacklo_u8_s16(a.lo)); +} + +SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_u8_s16(a.hi), v128_unpacklo_u8_s16(a.hi)); +} + +SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a)); +} + +SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a.lo), v128_unpacklo_s8_s16(a.lo)); +} + +SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a.hi), v128_unpacklo_s8_s16(a.hi)); +} + +SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { + return v256_from_v128(v128_pack_s32_s16(a.hi, a.lo), + 
v128_pack_s32_s16(b.hi, b.lo)); +} + +SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { + return v256_from_v128(v128_pack_s16_u8(a.hi, a.lo), + v128_pack_s16_u8(b.hi, b.lo)); +} + +SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { + return v256_from_v128(v128_pack_s16_s8(a.hi, a.lo), + v128_pack_s16_s8(b.hi, b.lo)); +} + +SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { + return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a)); +} + +SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { + return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a)); +} + +SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_u16_s32(a.lo), + v128_unpacklo_u16_s32(a.lo)); +} + +SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_s16_s32(a.lo), + v128_unpacklo_s16_s32(a.lo)); +} + +SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_u16_s32(a.hi), + v128_unpacklo_u16_s32(a.hi)); +} + +SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_s16_s32(a.hi), + v128_unpacklo_s16_s32(a.hi)); +} + +SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) { + v128 c16 = v128_dup_8(16); + v128 maskhi = v128_cmplt_s8(pattern.hi, c16); + v128 masklo = v128_cmplt_s8(pattern.lo, c16); + return v256_from_v128( + v128_or( + v128_and(v128_shuffle_8(a.lo, pattern.hi), maskhi), + v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.hi, c16)), maskhi)), + v128_or(v128_and(v128_shuffle_8(a.lo, pattern.lo), masklo), + v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.lo, c16)), + masklo))); +} + +SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { + return v256_from_v128( + v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)), + v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern))); +} + +SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { + return v256_from_v128(v128_cmpgt_s8(a.hi, b.hi), v128_cmpgt_s8(a.lo, b.lo)); +} + +SIMD_INLINE v256 
v256_cmplt_s8(v256 a, v256 b) { + return v256_from_v128(v128_cmplt_s8(a.hi, b.hi), v128_cmplt_s8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { + return v256_from_v128(v128_cmpeq_8(a.hi, b.hi), v128_cmpeq_8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { + return v256_from_v128(v128_cmpgt_s16(a.hi, b.hi), v128_cmpgt_s16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { + return v256_from_v128(v128_cmplt_s16(a.hi, b.hi), v128_cmplt_s16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { + return v256_from_v128(v128_cmpeq_16(a.hi, b.hi), v128_cmpeq_16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) { + return v256_from_v128(v128_shl_8(a.hi, c), v128_shl_8(a.lo, c)); +} + +SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) { + return v256_from_v128(v128_shr_u8(a.hi, c), v128_shr_u8(a.lo, c)); +} + +SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) { + return v256_from_v128(v128_shr_s8(a.hi, c), v128_shr_s8(a.lo, c)); +} + +SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) { + return v256_from_v128(v128_shl_16(a.hi, c), v128_shl_16(a.lo, c)); +} + +SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) { + return v256_from_v128(v128_shr_u16(a.hi, c), v128_shr_u16(a.lo, c)); +} + +SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) { + return v256_from_v128(v128_shr_s16(a.hi, c), v128_shr_s16(a.lo, c)); +} + +SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) { + return v256_from_v128(v128_shl_32(a.hi, c), v128_shl_32(a.lo, c)); +} + +SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) { + return v256_from_v128(v128_shr_u32(a.hi, c), v128_shr_u32(a.lo, c)); +} + +SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { + return v256_from_v128(v128_shr_s32(a.hi, c), v128_shr_s32(a.lo, c)); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +#define v256_shl_n_byte(a, n) \ + ((n) < 16 ? 
v256_from_v128(v128_or(v128_shl_n_byte((a).hi, n), \ + v128_shr_n_byte((a).lo, 16 - (n))), \ + v128_shl_n_byte((a).lo, (n))) \ + : v256_from_v128((n) > 16 ? v128_shl_n_byte((a).lo, (n)-16) : (a).lo, \ + v128_zero())) + +/* The macro arguments below are expanded more than once, so pass operands + without side effects. */ +#define v256_shr_n_byte(a, n) \ + ((n) < 16 ? v256_from_v128(v128_shr_n_byte((a).hi, n), \ + v128_or(v128_shr_n_byte((a).lo, n), \ + v128_shl_n_byte((a).hi, 16 - (n)))) \ + : v256_from_v128(v128_zero(), \ + (n) > 16 ? v128_shr_n_byte((a).hi, (n)-16) : (a).hi)) + +#define v256_align(a, b, c) \ + ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : (b)) + +#define v256_shl_n_8(a, n) \ + v256_from_v128(v128_shl_n_8((a).hi, n), v128_shl_n_8((a).lo, n)) +#define v256_shl_n_16(a, n) \ + v256_from_v128(v128_shl_n_16((a).hi, n), v128_shl_n_16((a).lo, n)) +#define v256_shl_n_32(a, n) \ + v256_from_v128(v128_shl_n_32((a).hi, n), v128_shl_n_32((a).lo, n)) +#define v256_shr_n_u8(a, n) \ + v256_from_v128(v128_shr_n_u8((a).hi, n), v128_shr_n_u8((a).lo, n)) +#define v256_shr_n_u16(a, n) \ + v256_from_v128(v128_shr_n_u16((a).hi, n), v128_shr_n_u16((a).lo, n)) +#define v256_shr_n_u32(a, n) \ + v256_from_v128(v128_shr_n_u32((a).hi, n), v128_shr_n_u32((a).lo, n)) +#define v256_shr_n_s8(a, n) \ + v256_from_v128(v128_shr_n_s8((a).hi, n), v128_shr_n_s8((a).lo, n)) +#define v256_shr_n_s16(a, n) \ + v256_from_v128(v128_shr_n_s16((a).hi, n), v128_shr_n_s16((a).lo, n)) +#define v256_shr_n_s32(a, n) \ + v256_from_v128(v128_shr_n_s32((a).hi, n), v128_shr_n_s32((a).lo, n)) + +#endif /* _V256_INTRINSICS_V128_H */ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h new file mode 100644 index 000000000..b82daab68 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h @@ -0,0 +1,548 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V256_INTRINSICS_H +#define _V256_INTRINSICS_H + +#if !defined(__AVX2__) + +#include "./v256_intrinsics_v128.h" + +#else + +// The __m256i type seems to cause problems for g++'s mangling prior to +// version 5, but adding -fabi-version=0 fixes this. +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 && \ + defined(__AVX2__) && defined(__cplusplus) +#pragma GCC optimize "-fabi-version=0" +#endif + +#include <immintrin.h> +#include "./v128_intrinsics_x86.h" + +typedef __m256i v256; + +SIMD_INLINE uint32_t v256_low_u32(v256 a) { + return (uint32_t)_mm_cvtsi128_si32(_mm256_extracti128_si256(a, 0)); +} + +SIMD_INLINE v64 v256_low_v64(v256 a) { + return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero()); +} + +SIMD_INLINE v128 v256_low_v128(v256 a) { + return _mm256_extracti128_si256(a, 0); +} + +SIMD_INLINE v128 v256_high_v128(v256 a) { + return _mm256_extracti128_si256(a, 1); +} + +SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) { + // gcc seems to be missing _mm256_set_m128i() + return _mm256_insertf128_si256( + _mm256_insertf128_si256(_mm256_setzero_si256(), b, 0), a, 1); +} + +SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) { + return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d)); +} + +SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { + return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d)); +} + +SIMD_INLINE v256 v256_load_aligned(const void *p) { + return _mm256_load_si256((const __m256i *)p); +} + +SIMD_INLINE v256 v256_load_unaligned(const void *p) { + return _mm256_loadu_si256((const __m256i *)p); +} + +SIMD_INLINE void v256_store_aligned(void *p, v256 a) { 
+ _mm256_store_si256((__m256i *)p, a); +} + +SIMD_INLINE void v256_store_unaligned(void *p, v256 a) { + _mm256_storeu_si256((__m256i *)p, a); +} + +SIMD_INLINE v256 v256_zero() { return _mm256_setzero_si256(); } + +SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8(x); } + +SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); } + +SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); } + +SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); } + +SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); } + +SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { + return _mm256_adds_epi16(a, b); +} + +SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); } + +SIMD_INLINE v256 v256_padd_s16(v256 a) { + return _mm256_madd_epi16(a, _mm256_set1_epi16(1)); +} + +SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return _mm256_sub_epi8(a, b); } + +SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return _mm256_subs_epu8(a, b); } + +SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return _mm256_subs_epi8(a, b); } + +SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); } + +SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { + return _mm256_subs_epi16(a, b); +} + +SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { + return _mm256_subs_epu16(a, b); +} + +SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); } + +SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); } + +SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); } + +// AVX doesn't have the direct intrinsics to zip/unzip 8, 16, 32 bit +// lanes of lower or upper halves of a 256bit vector because the +// unpack/pack intrinsics operate on the 256 bit input vector as 2 +// independent 128 bit vectors. 
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_8(v256_low_v128(a), v256_low_v128(b)), + v128_ziplo_8(v256_low_v128(a), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_8(v256_high_v128(a), v256_high_v128(b)), + v128_ziplo_8(v256_high_v128(a), v256_high_v128(b))); +} + +SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_16(v256_low_v128(a), v256_low_v128(b)), + v128_ziplo_16(v256_low_v128(a), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_16(v256_high_v128(a), v256_high_v128(b)), + v128_ziplo_16(v256_high_v128(a), v256_high_v128(b))); +} + +SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_32(v256_low_v128(a), v256_low_v128(b)), + v128_ziplo_32(v256_low_v128(a), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_32(v256_high_v128(a), v256_high_v128(b)), + v128_ziplo_32(v256_high_v128(a), v256_high_v128(b))); +} + +SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_64(v256_low_v128(a), v256_low_v128(b)), + v128_ziplo_64(v256_low_v128(a), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_64(v256_high_v128(a), v256_high_v128(b)), + v128_ziplo_64(v256_high_v128(a), v256_high_v128(b))); +} + +SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { + return v256_from_v128(v256_low_v128(a), v256_low_v128(b)); +} + +SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { + return v256_from_v128(v256_high_v128(a), v256_high_v128(b)); +} + +SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b)); +} + +SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b)); +} + +SIMD_INLINE v256 v256_zip_32(v128 a, v128 
b) { + return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b)); +} + +SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_8(v256_high_v128(a), v256_low_v128(a)), + v128_unziplo_8(v256_high_v128(b), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_8(v256_high_v128(a), v256_low_v128(a)), + v128_unziphi_8(v256_high_v128(b), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_16(v256_high_v128(a), v256_low_v128(a)), + v128_unziplo_16(v256_high_v128(b), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_16(v256_high_v128(a), v256_low_v128(a)), + v128_unziphi_16(v256_high_v128(b), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_32(v256_high_v128(a), v256_low_v128(a)), + v128_unziplo_32(v256_high_v128(b), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_32(v256_high_v128(a), v256_low_v128(a)), + v128_unziphi_32(v256_high_v128(b), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { + return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a)); +} + +SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_u8_s16(v256_low_v128(a)), + v128_unpacklo_u8_s16(v256_low_v128(a))); +} + +SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_u8_s16(v256_high_v128(a)), + v128_unpacklo_u8_s16(v256_high_v128(a))); +} + +SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a)); +} + +SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_s8_s16(v256_low_v128(a)), + v128_unpacklo_s8_s16(v256_low_v128(a))); +} + +SIMD_INLINE v256 
v256_unpackhi_s8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_s8_s16(v256_high_v128(a)), + v128_unpacklo_s8_s16(v256_high_v128(a))); +} + +SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { + return v256_from_v128(v128_pack_s32_s16(v256_high_v128(a), v256_low_v128(a)), + v128_pack_s32_s16(v256_high_v128(b), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { + return v256_from_v128(v128_pack_s16_u8(v256_high_v128(a), v256_low_v128(a)), + v128_pack_s16_u8(v256_high_v128(b), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { + return v256_from_v128(v128_pack_s16_s8(v256_high_v128(a), v256_low_v128(a)), + v128_pack_s16_s8(v256_high_v128(b), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { + return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a)); +} + +SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { + return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a)); +} + +SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_u16_s32(v256_low_v128(a)), + v128_unpacklo_u16_s32(v256_low_v128(a))); +} + +SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_s16_s32(v256_low_v128(a)), + v128_unpacklo_s16_s32(v256_low_v128(a))); +} + +SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_u16_s32(v256_high_v128(a)), + v128_unpacklo_u16_s32(v256_high_v128(a))); +} + +SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_s16_s32(v256_high_v128(a)), + v128_unpacklo_s16_s32(v256_high_v128(a))); +} +SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) { + v128 c16 = v128_dup_8(16); + v128 hi = v256_high_v128(pattern); + v128 lo = v256_low_v128(pattern); + v128 maskhi = v128_cmplt_s8(hi, c16); + v128 masklo = v128_cmplt_s8(lo, c16); + return v256_from_v128( + v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), hi), maskhi), + 
v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(hi, c16)), + maskhi)), + v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), lo), masklo), + v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(lo, c16)), + masklo))); +} + +SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { + return _mm256_shuffle_epi8(a, pattern); +} + +SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { + v256 r = _mm256_madd_epi16(a, b); +#if defined(__x86_64__) + v128 t; + r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)), + _mm256_cvtepi32_epi64(v256_low_v128(r))); + t = v256_low_v128(_mm256_add_epi64( + r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1)))); + return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8))); +#else + v128 l = v256_low_v128(r); + v128 h = v256_high_v128(r); + return (int64_t)_mm_cvtsi128_si32(l) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) + + (int64_t)_mm_cvtsi128_si32(h) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12)); +#endif +} + +SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { + v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256()); + v128 lo = v256_low_v128(t); + v128 hi = v256_high_v128(t); + lo = v128_add_32(lo, hi); + return v64_low_u32(v128_low_v64(lo)) + v128_low_u32(v128_high_v64(lo)); +} + +typedef v256 sad256_internal; + +SIMD_INLINE sad256_internal v256_sad_u8_init() { + return _mm256_setzero_si256(); +} + +/* Implementation dependent return value. Result must be finalised with + v256_sad_u8_sum(). + The result for more than 32 v256_sad_u8() calls is undefined. 
*/ +SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { + return _mm256_add_epi64(s, _mm256_sad_epu8(a, b)); +} + +SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) { + v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s)); + return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t))); +} + +typedef v256 ssd256_internal; + +SIMD_INLINE ssd256_internal v256_ssd_u8_init() { + return _mm256_setzero_si256(); +} + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_u8_sum(). */ +SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { + v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()), + _mm256_unpacklo_epi8(b, _mm256_setzero_si256())); + v256 h = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()), + _mm256_unpackhi_epi8(b, _mm256_setzero_si256())); + v256 rl = _mm256_madd_epi16(l, l); + v256 rh = _mm256_madd_epi16(h, h); + v128 c = _mm_cvtsi32_si128(32); + rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 8)); + rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 4)); + rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 8)); + rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 4)); + return _mm256_add_epi64( + s, + _mm256_srl_epi64(_mm256_sll_epi64(_mm256_unpacklo_epi64(rl, rh), c), c)); +} + +SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { + v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s)); + return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t))); +} + +SIMD_INLINE v256 v256_or(v256 a, v256 b) { return _mm256_or_si256(a, b); } + +SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); } + +SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); } + +SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return _mm256_andnot_si256(b, a); } + +SIMD_INLINE v256 v256_mul_s16(v64 a, v64 b) { + v128 lo_bits = v128_mullo_s16(a, b); + v128 hi_bits = v128_mulhi_s16(a, b); + return 
v256_from_v128(v128_ziphi_16(hi_bits, lo_bits), + v128_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { + return _mm256_mullo_epi16(a, b); +} + +SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { + return _mm256_mulhi_epi16(a, b); +} + +SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { + return _mm256_mullo_epi32(a, b); +} + +SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { + return _mm256_madd_epi16(a, b); +} + +SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { + return _mm256_maddubs_epi16(a, b); +} + +SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return _mm256_avg_epu8(a, b); } + +SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { + return _mm256_sub_epi8( + _mm256_avg_epu8(a, b), + _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1))); +} + +SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); } + +SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); } + +SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); } + +SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); } + +SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); } + +SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); } + +SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); } + +SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { + return _mm256_cmpgt_epi8(a, b); +} + +SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { + return v256_andn(_mm256_cmpgt_epi8(b, a), _mm256_cmpeq_epi8(b, a)); +} + +SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { + return _mm256_cmpeq_epi8(a, b); +} + +SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { + return _mm256_cmpgt_epi16(a, b); +} + +SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { + return v256_andn(_mm256_cmpgt_epi16(b, a), _mm256_cmpeq_epi16(b, a)); +} + +SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { + return _mm256_cmpeq_epi16(a, b); +} + +SIMD_INLINE v256 
v256_shl_8(v256 a, unsigned int c) { + return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)), + _mm256_sll_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) { + return _mm256_and_si256(_mm256_set1_epi8(0xff >> c), + _mm256_srl_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) { + __m128i x = _mm_cvtsi32_si128(c + 8); + return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x), + _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x)); +} + +SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) { + return _mm256_sll_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) { + return _mm256_srl_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) { + return _mm256_sra_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) { + return _mm256_sll_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) { + return _mm256_srl_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { + return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c)); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +// _mm256_slli_si256 works on 128 bit lanes and can't be used +#define v256_shl_n_byte(a, n) \ + ((n) < 16 \ + ? v256_from_v128(v128_or(v128_shl_n_byte(v256_high_v128(a), n), \ + v128_shr_n_byte(v256_low_v128(a), 16 - (n))), \ + v128_shl_n_byte(v256_low_v128(a), n)) \ + : v256_from_v128(v128_shl_n_byte(v256_low_v128(a), (n)-16), \ + v128_zero())) + +// _mm256_srli_si256 works on 128 bit lanes and can't be used +#define v256_shr_n_byte(a, n) \ + ((n) < 16 \ + ? _mm256_alignr_epi8( \ + _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \ + : ((n) > 16 \ + ? 
_mm256_srli_si256( \ + _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), \ + (n)-16) \ + : _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)))) + +// _mm256_alignr_epi8 works on two 128 bit lanes and can't be used +#define v256_align(a, b, c) \ + ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : (b)) + +#define v256_shl_n_8(a, c) \ + _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << (c))), \ + _mm256_slli_epi16(a, c)) +#define v256_shr_n_u8(a, c) \ + _mm256_and_si256(_mm256_set1_epi8(0xff >> (c)), _mm256_srli_epi16(a, c)) +#define v256_shr_n_s8(a, c) \ + _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \ + _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8)) +#define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c) +#define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c) +#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c) +#define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c) +#define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c) +#define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c) +#endif + +#endif /* _V256_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics.h b/third_party/aom/aom_dsp/simd/v64_intrinsics.h new file mode 100644 index 000000000..ee2b683a4 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics.h @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef _V64_INTRINSICS_H +#define _V64_INTRINSICS_H + +#include <stdio.h> +#include <stdlib.h> +#include "./v64_intrinsics_c.h" + +/* Fallback to plain, unoptimised C. */ + +typedef c_v64 v64; + +SIMD_INLINE uint32_t v64_low_u32(v64 a) { return c_v64_low_u32(a); } +SIMD_INLINE uint32_t v64_high_u32(v64 a) { return c_v64_high_u32(a); } +SIMD_INLINE int32_t v64_low_s32(v64 a) { return c_v64_low_s32(a); } +SIMD_INLINE int32_t v64_high_s32(v64 a) { return c_v64_high_s32(a); } +SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) { + return c_v64_from_32(x, y); +} +SIMD_INLINE v64 v64_from_64(uint64_t x) { return c_v64_from_64(x); } +SIMD_INLINE uint64_t v64_u64(v64 x) { return c_v64_u64(x); } +SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { + return c_v64_from_16(a, b, c, d); +} + +SIMD_INLINE uint32_t u32_load_unaligned(const void *p) { + return c_u32_load_unaligned(p); +} +SIMD_INLINE uint32_t u32_load_aligned(const void *p) { + return c_u32_load_aligned(p); +} +SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) { + c_u32_store_unaligned(p, a); +} +SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) { + c_u32_store_aligned(p, a); +} + +SIMD_INLINE v64 v64_load_unaligned(const void *p) { + return c_v64_load_unaligned(p); +} +SIMD_INLINE v64 v64_load_aligned(const void *p) { + return c_v64_load_aligned(p); +} + +SIMD_INLINE void v64_store_unaligned(void *p, v64 a) { + c_v64_store_unaligned(p, a); +} +SIMD_INLINE void v64_store_aligned(void *p, v64 a) { + c_v64_store_aligned(p, a); +} + +SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) { return c_v64_align(a, b, c); } + +SIMD_INLINE v64 v64_zero() { return c_v64_zero(); } +SIMD_INLINE v64 v64_dup_8(uint8_t x) { return c_v64_dup_8(x); } +SIMD_INLINE v64 v64_dup_16(uint16_t x) { return c_v64_dup_16(x); } +SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); } + +SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return c_v64_add_8(a, b); } +SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return 
c_v64_add_16(a, b); } +SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return c_v64_sadd_s16(a, b); } +SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return c_v64_add_32(a, b); } +SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return c_v64_sub_8(a, b); } +SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return c_v64_ssub_u8(a, b); } +SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return c_v64_ssub_s8(a, b); } +SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return c_v64_sub_16(a, b); } +SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return c_v64_ssub_s16(a, b); } +SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return c_v64_ssub_u16(a, b); } +SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return c_v64_sub_32(a, b); } +SIMD_INLINE v64 v64_abs_s16(v64 a) { return c_v64_abs_s16(a); } +SIMD_INLINE v64 v64_abs_s8(v64 a) { return c_v64_abs_s8(a); } + +SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return c_v64_ziplo_8(a, b); } +SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { return c_v64_ziphi_8(a, b); } +SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return c_v64_ziplo_16(a, b); } +SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { return c_v64_ziphi_16(a, b); } +SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return c_v64_ziplo_32(a, b); } +SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { return c_v64_ziphi_32(a, b); } +SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { return c_v64_unziplo_8(a, b); } +SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { return c_v64_unziphi_8(a, b); } +SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { return c_v64_unziplo_16(a, b); } +SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { return c_v64_unziphi_16(a, b); } +SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { return c_v64_unpacklo_u8_s16(a); } +SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { return c_v64_unpackhi_u8_s16(a); } +SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { return c_v64_unpacklo_s8_s16(a); } +SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { return c_v64_unpackhi_s8_s16(a); } +SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) { + return 
c_v64_pack_s32_s16(a, b); +} +SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) { + return c_v64_pack_s16_u8(a, b); +} +SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) { + return c_v64_pack_s16_s8(a, b); +} +SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) { + return c_v64_unpacklo_u16_s32(a); +} +SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) { + return c_v64_unpacklo_s16_s32(a); +} +SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) { + return c_v64_unpackhi_u16_s32(a); +} +SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) { + return c_v64_unpackhi_s16_s32(a); +} +SIMD_INLINE v64 v64_shuffle_8(v64 a, v64 pattern) { + return c_v64_shuffle_8(a, pattern); +} + +typedef uint32_t sad64_internal; +SIMD_INLINE sad64_internal v64_sad_u8_init() { return c_v64_sad_u8_init(); } +SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) { + return c_v64_sad_u8(s, a, b); +} +SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { + return c_v64_sad_u8_sum(s); +} +typedef uint32_t ssd64_internal; +SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return c_v64_ssd_u8_init(); } +SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) { + return c_v64_ssd_u8(s, a, b); +} +SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { + return c_v64_ssd_u8_sum(s); +} +SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { return c_v64_dotp_su8(a, b); } +SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { return c_v64_dotp_s16(a, b); } +SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { return c_v64_hadd_u8(a); } +SIMD_INLINE int64_t v64_hadd_s16(v64 a) { return c_v64_hadd_s16(a); } + +SIMD_INLINE v64 v64_or(v64 a, v64 b) { return c_v64_or(a, b); } +SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return c_v64_xor(a, b); } +SIMD_INLINE v64 v64_and(v64 a, v64 b) { return c_v64_and(a, b); } +SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return c_v64_andn(a, b); } + +SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return c_v64_mullo_s16(a, b); } +SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return c_v64_mulhi_s16(a, b); } 
+SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { return c_v64_mullo_s32(a, b); } +SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return c_v64_madd_s16(a, b); } +SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { return c_v64_madd_us8(a, b); } + +SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return c_v64_avg_u8(a, b); } +SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return c_v64_rdavg_u8(a, b); } +SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return c_v64_avg_u16(a, b); } +SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return c_v64_min_u8(a, b); } +SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return c_v64_max_u8(a, b); } +SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { return c_v64_min_s8(a, b); } +SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { return c_v64_max_s8(a, b); } +SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return c_v64_min_s16(a, b); } +SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return c_v64_max_s16(a, b); } + +SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return c_v64_cmpgt_s8(a, b); } +SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return c_v64_cmplt_s8(a, b); } +SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return c_v64_cmpeq_8(a, b); } +SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return c_v64_cmpgt_s16(a, b); } +SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return c_v64_cmplt_s16(a, b); } +SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return c_v64_cmpeq_16(a, b); } + +SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int n) { return c_v64_shl_8(a, n); } +SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int n) { return c_v64_shr_u8(a, n); } +SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int n) { return c_v64_shr_s8(a, n); } +SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int n) { return c_v64_shl_16(a, n); } +SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int n) { + return c_v64_shr_u16(a, n); +} +SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int n) { + return c_v64_shr_s16(a, n); +} +SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int n) { return c_v64_shl_32(a, n); } +SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned 
int n) { + return c_v64_shr_u32(a, n); +} +SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int n) { + return c_v64_shr_s32(a, n); +} +SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int n) { + return c_v64_shr_n_byte(a, n); +} +SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int n) { + return c_v64_shl_n_byte(a, n); +} +SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { + return c_v64_shl_n_8(a, c); +} +SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { + return c_v64_shr_n_u8(a, c); +} +SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { + return c_v64_shr_n_s8(a, c); +} +SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { + return c_v64_shl_n_16(a, c); +} +SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { + return c_v64_shr_n_u16(a, c); +} +SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { + return c_v64_shr_n_s16(a, c); +} +SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { + return c_v64_shl_n_32(a, c); +} +SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { + return c_v64_shr_n_u32(a, c); +} +SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { + return c_v64_shr_n_s32(a, c); +} + +#endif /* _V64_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h new file mode 100644 index 000000000..c7574eef5 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef _V64_INTRINSICS_H +#define _V64_INTRINSICS_H + +#include <arm_neon.h> +#include "./v64_intrinsics_arm.h" +#include "aom_ports/arm.h" + +#ifdef AOM_INCOMPATIBLE_GCC +#error Incompatible gcc +#endif + +typedef int64x1_t v64; + +SIMD_INLINE uint32_t v64_low_u32(v64 a) { + return vget_lane_u32(vreinterpret_u32_s64(a), 0); +} + +SIMD_INLINE uint32_t v64_high_u32(v64 a) { + return vget_lane_u32(vreinterpret_u32_s64(a), 1); +} + +SIMD_INLINE int32_t v64_low_s32(v64 a) { + return vget_lane_s32(vreinterpret_s32_s64(a), 0); +} + +SIMD_INLINE int32_t v64_high_s32(v64 a) { + return vget_lane_s32(vreinterpret_s32_s64(a), 1); +} + +SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { + return vcreate_s64((uint64_t)a << 48 | (uint64_t)b << 32 | (uint64_t)c << 16 | + d); +} + +SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) { + return vcreate_s64((uint64_t)x << 32 | y); +} + +SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); } + +SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; } + +SIMD_INLINE uint32_t u32_load_aligned(const void *p) { + return *((uint32_t *)p); +} + +SIMD_INLINE uint32_t u32_load_unaligned(const void *p) { + return vget_lane_u32(vreinterpret_u32_u8(vld1_u8((const uint8_t *)p)), 0); +} + +SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) { + *((uint32_t *)p) = a; +} + +SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) { +#if defined(__clang__) + vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a), + 0); +#elif defined(__CC_ARM) + *((__packed uint32_t *)p) = a; +#elif defined(__GNUC__) + *((__attribute((packed)) uint32_t *)p) = a; +#else + vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a), + 0); +#endif +} + +SIMD_INLINE v64 v64_load_aligned(const void *p) { + return vreinterpret_s64_u8(vld1_u8((const uint8_t *)p)); +} + +SIMD_INLINE v64 v64_load_unaligned(const void *p) { + return v64_load_aligned(p); +} + +SIMD_INLINE void v64_store_aligned(void 
*p, v64 r) { + vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r)); +} + +SIMD_INLINE void v64_store_unaligned(void *p, v64 r) { + vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r)); +} + +// The following function requires an immediate. +// Some compilers will check this when optimising; others won't. +SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) { +#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) + return c ? vreinterpret_s64_s8( + vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c)) + : b; +#else + return c ? v64_from_64(((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8)) + : b; +#endif +} + +SIMD_INLINE v64 v64_zero() { return vreinterpret_s64_u8(vdup_n_u8(0)); } + +SIMD_INLINE v64 v64_dup_8(uint8_t x) { + return vreinterpret_s64_u8(vdup_n_u8(x)); +} + +SIMD_INLINE v64 v64_dup_16(uint16_t x) { + return vreinterpret_s64_u16(vdup_n_u16(x)); +} + +SIMD_INLINE v64 v64_dup_32(uint32_t x) { + return vreinterpret_s64_u32(vdup_n_u32(x)); +} + +SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) { + int64x2_t r = vpaddlq_s32(vpaddlq_s16( + vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)), + vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y)))))); + return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r)); +} + +SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) { + int64x2_t r = + vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); + return (int64_t)(vget_high_s64(r) + vget_low_s64(r)); +} + +SIMD_INLINE uint64_t v64_hadd_u8(v64 x) { + return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x)))); +} + +SIMD_INLINE int64_t v64_hadd_s16(v64 a) { + return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a))); +} + +typedef uint16x8_t sad64_internal; + +SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); } + +/* Implementation dependent return value. Result must be finalised with + v64_sad_u8_sum(). + The result for more than 32 v64_sad_u8() calls is undefined. 
*/ +SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) { + return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b)); +} + +SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { + uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s)); + return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r)); +} + +typedef int64x1_t ssd64_internal; + +SIMD_INLINE ssd64_internal v64_ssd_u8_init() { + return (ssd64_internal)(uint64_t)0; +} + +/* Implementation dependent return value. Result must be finalised with + * v64_ssd_u8_sum(). */ +SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) { + uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b)); + uint64x2_t r = vpaddlq_u32(vpaddlq_u16(vmull_u8(t, t))); + return vadd_u64(s, vadd_u64(vget_high_u64(r), vget_low_u64(r))); +} + +SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { + return (uint32_t)(uint64_t)s; +} + +SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); } + +SIMD_INLINE v64 v64_xor(v64 x, v64 y) { return veor_s64(x, y); } + +SIMD_INLINE v64 v64_and(v64 x, v64 y) { return vand_s64(x, y); } + +SIMD_INLINE v64 v64_andn(v64 x, v64 y) { return vbic_s64(x, y); } + +SIMD_INLINE v64 v64_add_8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_add_16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_sadd_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vqadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_add_32(v64 x, v64 y) { + return vreinterpret_s64_u32( + vadd_u32(vreinterpret_u32_s64(x), vreinterpret_u32_s64(y))); +} + +SIMD_INLINE v64 v64_sub_8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_sub_16(v64 x, v64 y) { + return vreinterpret_s64_s16( + 
vsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_ssub_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vqsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_ssub_u16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vqsub_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); +} + +SIMD_INLINE v64 v64_ssub_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vqsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_ssub_s8(v64 x, v64 y) { + return vreinterpret_s64_s8( + vqsub_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_sub_32(v64 x, v64 y) { + return vreinterpret_s64_s32( + vsub_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y))); +} + +SIMD_INLINE v64 v64_abs_s16(v64 x) { + return vreinterpret_s64_s16(vabs_s16(vreinterpret_s16_s64(x))); +} + +SIMD_INLINE v64 v64_abs_s8(v64 x) { + return vreinterpret_s64_s8(vabs_s8(vreinterpret_s8_s64(x))); +} + +SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vmul_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) { + return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32( + vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16))); +} + +SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) { + return vreinterpret_s64_s32( + vmul_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y))); +} + +SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) { + int32x4_t t = vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)); + return vreinterpret_s64_s32( + vpadd_s32(vreinterpret_s32_s64(vget_low_s64(vreinterpretq_s64_s32(t))), + vreinterpret_s32_s64(vget_high_s64(vreinterpretq_s64_s32(t))))); +} + +SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) { + return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16( + vaddq_s16(vmull_s8(vadd_s8(vreinterpret_s8_s64(x), vdup_n_s8(-128)), + vreinterpret_s8_s64(y)), + 
vshlq_n_s16(vmovl_s8(vreinterpret_s8_s64(y)), 7))))); +} + +SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vrhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); +} + +SIMD_INLINE v64 v64_max_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vmax_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_min_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vmin_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_max_s8(v64 x, v64 y) { + return vreinterpret_s64_s8( + vmax_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_min_s8(v64 x, v64 y) { + return vreinterpret_s64_s8( + vmin_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_max_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vmax_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vmin_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) { + uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpret_s64_u8(r.val[0]); +} + +SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) { + uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpret_s64_u8(r.val[1]); +} + +SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) { + int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x)); + return vreinterpret_s64_s16(r.val[0]); +} + +SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) { + int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x)); + return vreinterpret_s64_s16(r.val[1]); +} 
+ +SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) { + int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)); + return vreinterpret_s64_s32(r.val[0]); +} + +SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) { + int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)); + return vreinterpret_s64_s32(r.val[1]); +} + +SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { + return vreinterpret_s64_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s64(a)))); +} + +SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { + return vreinterpret_s64_u16(vget_high_u16(vmovl_u8(vreinterpret_u8_s64(a)))); +} + +SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { + return vreinterpret_s64_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s64(a)))); +} + +SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { + return vreinterpret_s64_s16(vget_high_s16(vmovl_s8(vreinterpret_s8_s64(a)))); +} + +SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) { + return vreinterpret_s64_s16(vqmovn_s32( + vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))); +} + +SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) { + return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32( + vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))))); +} + +SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) { + return vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s32( + vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))))); +} + +SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) { + uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpret_s64_u8(r.val[0]); +} + +SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) { + uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpret_s64_u8(r.val[1]); +} + +SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) { + uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); + return vreinterpret_s64_u16(r.val[0]); +} + +SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) { + uint16x4x2_t r = 
vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); + return vreinterpret_s64_u16(r.val[1]); +} + +SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) { + return vreinterpret_s64_s32(vget_low_s32(vmovl_s16(vreinterpret_s16_s64(x)))); +} + +SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 x) { + return vreinterpret_s64_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s64(x)))); +} + +SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 x) { + return vreinterpret_s64_s32( + vget_high_s32(vmovl_s16(vreinterpret_s16_s64(x)))); +} + +SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 x) { + return vreinterpret_s64_u32( + vget_high_u32(vmovl_u16(vreinterpret_u16_s64(x)))); +} + +SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) { + return vreinterpret_s64_u8( + vtbl1_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(pattern))); +} + +SIMD_INLINE v64 v64_cmpgt_s8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vcgt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_cmplt_s8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vclt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_cmpeq_8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vceq_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_cmpgt_s16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vcgt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_cmplt_s16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vclt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_cmpeq_16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vceq_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) { + return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(c))); +} + +SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) { + return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(-c))); +} + +SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) { + return 
vreinterpret_s64_s8(vshl_s8(vreinterpret_s8_s64(a), vdup_n_s8(-c))); +} + +SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) { + return vreinterpret_s64_u16(vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(c))); +} + +SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) { + return vreinterpret_s64_u16( + vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(-(int)c))); +} + +SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) { + return vreinterpret_s64_s16( + vshl_s16(vreinterpret_s16_s64(a), vdup_n_s16(-(int)c))); +} + +SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) { + return vreinterpret_s64_u32(vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(c))); +} + +SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) { + return vreinterpret_s64_u32( + vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(-(int)c))); +} + +SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) { + return vreinterpret_s64_s32( + vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int)c))); +} + +// The following functions require an immediate. +// Some compilers will check this during optimisation, others wont. +#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) + +SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) { + return vshl_n_s64(a, c * 8); +} + +SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) { + return c ? 
(v64)vshr_n_u64(vreinterpret_u64_s64(a), c * 8) : a; +} + +SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { + return vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c)); +} + +SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { + return vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c)); +} + +SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { + return vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c)); +} + +SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { + return vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c)); +} + +SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { + return vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c)); +} + +SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { + return vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c)); +} + +SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { + return vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c)); +} + +SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { + return vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c)); +} + +SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { + return vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c)); +} + +#else + +SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) { + return v64_from_64(v64_u64(a) << c * 8); +} + +SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) { + return v64_from_64(v64_u64(a) >> c * 8); +} + +SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { return v64_shl_8(a, c); } + +SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { return v64_shr_u8(a, c); } + +SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { return v64_shr_s8(a, c); } + +SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { return v64_shl_16(a, c); } + +SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { + return v64_shr_u16(a, c); +} + +SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { + return v64_shr_s16(a, c); +} + +SIMD_INLINE v64 v64_shl_n_32(v64 a, 
unsigned int c) { return v64_shl_32(a, c); } + +SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { + return v64_shr_u32(a, c); +} + +SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { + return v64_shr_s32(a, c); +} + +#endif + +#endif /* _V64_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h new file mode 100644 index 000000000..5032238b6 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h @@ -0,0 +1,919 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V64_INTRINSICS_C_H +#define _V64_INTRINSICS_C_H + +/* Note: This implements the intrinsics in plain, unoptimised C. + Intended for reference, porting or debugging. 
*/ + +#include +#include +#include "./aom_config.h" + +typedef union { + uint8_t u8[8]; + uint16_t u16[4]; + uint32_t u32[2]; + uint64_t u64; + int8_t s8[8]; + int16_t s16[4]; + int32_t s32[2]; + int64_t s64; +} c_v64; + +SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) { return a.u32[CONFIG_BIG_ENDIAN]; } + +SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) { + return a.u32[!CONFIG_BIG_ENDIAN]; +} + +SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) { return a.s32[CONFIG_BIG_ENDIAN]; } + +SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) { + return a.s32[!CONFIG_BIG_ENDIAN]; +} + +SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) { + c_v64 t; + t.u32[!CONFIG_BIG_ENDIAN] = x; + t.u32[CONFIG_BIG_ENDIAN] = y; + return t; +} + +SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) { + c_v64 t; + t.u64 = x; + return t; +} + +SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; } + +SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c, + uint16_t d) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + t.u16[0] = a; + t.u16[1] = b; + t.u16[2] = c; + t.u16[3] = d; + } else { + t.u16[3] = a; + t.u16[2] = b; + t.u16[1] = c; + t.u16[0] = d; + } + return t; +} + +SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) { + uint32_t t; + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&t; + int c; + for (c = 0; c < 4; c++) q[c] = pp[c]; + return t; +} + +SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) { + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&a; + int c; + for (c = 0; c < 4; c++) pp[c] = q[c]; +} + +SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) { + if (SIMD_CHECK && (uintptr_t)p & 3) { + fprintf(stderr, "Error: Unaligned u32 load at %p\n", p); + abort(); + } + return c_u32_load_unaligned(p); +} + +SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) { + if (SIMD_CHECK && (uintptr_t)p & 3) { + fprintf(stderr, "Error: Unaligned u32 store at %p\n", p); + abort(); + } + c_u32_store_unaligned(p, a); +} + +SIMD_INLINE c_v64 c_v64_load_unaligned(const void 
*p) { + c_v64 t; + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&t; + int c; + for (c = 0; c < 8; c++) q[c] = pp[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) { + if (SIMD_CHECK && (uintptr_t)p & 7) { + fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p); + abort(); + } + return c_v64_load_unaligned(p); +} + +SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) { + uint8_t *q = (uint8_t *)p; + uint8_t *r = (uint8_t *)&a; + int c; + for (c = 0; c < 8; c++) q[c] = r[c]; +} + +SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) { + if (SIMD_CHECK && (uintptr_t)p & 7) { + fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p); + abort(); + } + c_v64_store_unaligned(p, a); +} + +SIMD_INLINE c_v64 c_v64_zero() { + c_v64 t; + t.u64 = 0; + return t; +} + +SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) { + c_v64 t; + t.u8[0] = t.u8[1] = t.u8[2] = t.u8[3] = t.u8[4] = t.u8[5] = t.u8[6] = + t.u8[7] = x; + return t; +} + +SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) { + c_v64 t; + t.u16[0] = t.u16[1] = t.u16[2] = t.u16[3] = x; + return t; +} + +SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) { + c_v64 t; + t.u32[0] = t.u32[1] = x; + return t; +} + +SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] + b.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] + b.u16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.s16[c] = (int32_t)a.s16[c] + (int32_t)b.s16[c] > 32767 + ? 32767 + : (int32_t)a.s16[c] + (int32_t)b.s16[c] < -32768 + ? 
-32768 + : (int32_t)a.s16[c] + (int32_t)b.s16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) { + c_v64 t; + t.u32[0] = (uint32_t)((uint64_t)a.u32[0] + b.u32[0]); + t.u32[1] = (uint32_t)((uint64_t)a.u32[1] + b.u32[1]); + return t; +} + +SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] - b.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) + t.u8[c] = (int32_t)a.u8[c] - (int32_t)b.u8[c] < 0 ? 0 : a.u8[c] - b.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) { + int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c]; + t.s8[c] = d > 127 ? 127 : (d < -128 ? -128 : d); + } + return t; +} + +SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] - b.u16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.s16[c] = (int32_t)a.s16[c] - (int32_t)b.s16[c] < -32768 + ? -32768 + : (int32_t)a.s16[c] - (int32_t)b.s16[c] > 32767 + ? 32767 + : (int32_t)a.s16[c] - (int32_t)b.s16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_u16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.u16[c] = + (int32_t)a.u16[c] - (int32_t)b.u16[c] < 0 ? 0 : a.u16[c] - b.u16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) { + c_v64 t; + t.u32[0] = (uint32_t)((int64_t)a.u32[0] - b.u32[0]); + t.u32[1] = (uint32_t)((int64_t)a.u32[1] - b.u32[1]); + return t; +} + +SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.u16[c] = (int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = (int8_t)a.u8[c] > 0 ? 
a.u8[c] : -a.u8[c]; + return t; +} + +SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u8[7] = a.u8[7]; + t.u8[6] = b.u8[7]; + t.u8[5] = a.u8[6]; + t.u8[4] = b.u8[6]; + t.u8[3] = a.u8[5]; + t.u8[2] = b.u8[5]; + t.u8[1] = a.u8[4]; + t.u8[0] = b.u8[4]; + } else { + t.u8[7] = a.u8[3]; + t.u8[6] = b.u8[3]; + t.u8[5] = a.u8[2]; + t.u8[4] = b.u8[2]; + t.u8[3] = a.u8[1]; + t.u8[2] = b.u8[1]; + t.u8[1] = a.u8[0]; + t.u8[0] = b.u8[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 1) : _c_v64_zip_8(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 0) : _c_v64_zip_8(a, b, 1); +} + +SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u16[3] = a.u16[3]; + t.u16[2] = b.u16[3]; + t.u16[1] = a.u16[2]; + t.u16[0] = b.u16[2]; + } else { + t.u16[3] = a.u16[1]; + t.u16[2] = b.u16[1]; + t.u16[1] = a.u16[0]; + t.u16[0] = b.u16[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 1) : _c_v64_zip_16(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 0) : _c_v64_zip_16(a, b, 1); +} + +SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u32[1] = a.u32[1]; + t.u32[0] = b.u32[1]; + } else { + t.u32[1] = a.u32[0]; + t.u32[0] = b.u32[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 1) : _c_v64_zip_32(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? 
_c_v64_zip_32(b, a, 0) : _c_v64_zip_32(a, b, 1); +} + +SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u8[7] = b.u8[7]; + t.u8[6] = b.u8[5]; + t.u8[5] = b.u8[3]; + t.u8[4] = b.u8[1]; + t.u8[3] = a.u8[7]; + t.u8[2] = a.u8[5]; + t.u8[1] = a.u8[3]; + t.u8[0] = a.u8[1]; + } else { + t.u8[7] = a.u8[6]; + t.u8[6] = a.u8[4]; + t.u8[5] = a.u8[2]; + t.u8[4] = a.u8[0]; + t.u8[3] = b.u8[6]; + t.u8[2] = b.u8[4]; + t.u8[1] = b.u8[2]; + t.u8[0] = b.u8[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(a, b, 1) : _c_v64_unzip_8(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(b, a, 0) : _c_v64_unzip_8(b, a, 1); +} + +SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u16[3] = b.u16[3]; + t.u16[2] = b.u16[1]; + t.u16[1] = a.u16[3]; + t.u16[0] = a.u16[1]; + } else { + t.u16[3] = a.u16[2]; + t.u16[2] = a.u16[0]; + t.u16[1] = b.u16[2]; + t.u16[0] = b.u16[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(a, b, 1) + : _c_v64_unzip_16(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? 
_c_v64_unzip_16(b, a, 0) + : _c_v64_unzip_16(b, a, 1); +} + +SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.u8[3 + endian]; + t.s16[2] = (int16_t)a.u8[2 + endian]; + t.s16[1] = (int16_t)a.u8[1 + endian]; + t.s16[0] = (int16_t)a.u8[0 + endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.u8[7 - endian]; + t.s16[2] = (int16_t)a.u8[6 - endian]; + t.s16[1] = (int16_t)a.u8[5 - endian]; + t.s16[0] = (int16_t)a.u8[4 - endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpacklo_s8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.s8[3 + endian]; + t.s16[2] = (int16_t)a.s8[2 + endian]; + t.s16[1] = (int16_t)a.s8[1 + endian]; + t.s16[0] = (int16_t)a.s8[0 + endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_s8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.s8[7 - endian]; + t.s16[2] = (int16_t)a.s8[6 - endian]; + t.s16[1] = (int16_t)a.s8[5 - endian]; + t.s16[0] = (int16_t)a.s8[4 - endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + c_v64 u = a; + a = b; + b = u; + } + t.s16[3] = a.s32[1] > 32767 ? 32767 : a.s32[1] < -32768 ? -32768 : a.s32[1]; + t.s16[2] = a.s32[0] > 32767 ? 32767 : a.s32[0] < -32768 ? -32768 : a.s32[0]; + t.s16[1] = b.s32[1] > 32767 ? 32767 : b.s32[1] < -32768 ? -32768 : b.s32[1]; + t.s16[0] = b.s32[0] > 32767 ? 32767 : b.s32[0] < -32768 ? -32768 : b.s32[0]; + return t; +} + +SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + c_v64 u = a; + a = b; + b = u; + } + t.u8[7] = a.s16[3] > 255 ? 255 : a.s16[3] < 0 ? 0 : a.s16[3]; + t.u8[6] = a.s16[2] > 255 ? 255 : a.s16[2] < 0 ? 0 : a.s16[2]; + t.u8[5] = a.s16[1] > 255 ? 255 : a.s16[1] < 0 ? 0 : a.s16[1]; + t.u8[4] = a.s16[0] > 255 ? 
255 : a.s16[0] < 0 ? 0 : a.s16[0]; + t.u8[3] = b.s16[3] > 255 ? 255 : b.s16[3] < 0 ? 0 : b.s16[3]; + t.u8[2] = b.s16[2] > 255 ? 255 : b.s16[2] < 0 ? 0 : b.s16[2]; + t.u8[1] = b.s16[1] > 255 ? 255 : b.s16[1] < 0 ? 0 : b.s16[1]; + t.u8[0] = b.s16[0] > 255 ? 255 : b.s16[0] < 0 ? 0 : b.s16[0]; + return t; +} + +SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + c_v64 u = a; + a = b; + b = u; + } + t.u8[7] = a.s16[3] > 127 ? 127 : a.s16[3] < -128 ? 128 : a.s16[3]; + t.u8[6] = a.s16[2] > 127 ? 127 : a.s16[2] < -128 ? 128 : a.s16[2]; + t.u8[5] = a.s16[1] > 127 ? 127 : a.s16[1] < -128 ? 128 : a.s16[1]; + t.u8[4] = a.s16[0] > 127 ? 127 : a.s16[0] < -128 ? 128 : a.s16[0]; + t.u8[3] = b.s16[3] > 127 ? 127 : b.s16[3] < -128 ? 128 : b.s16[3]; + t.u8[2] = b.s16[2] > 127 ? 127 : b.s16[2] < -128 ? 128 : b.s16[2]; + t.u8[1] = b.s16[1] > 127 ? 127 : b.s16[1] < -128 ? 128 : b.s16[1]; + t.u8[0] = b.s16[0] > 127 ? 127 : b.s16[0] < -128 ? 128 : b.s16[0]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.u16[1 + !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.u16[0 + !!CONFIG_BIG_ENDIAN * 2]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.s16[1 + !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.s16[0 + !!CONFIG_BIG_ENDIAN * 2]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.u16[3 - !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.u16[2 - !!CONFIG_BIG_ENDIAN * 2]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.s16[3 - !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.s16[2 - !!CONFIG_BIG_ENDIAN * 2]; + return t; +} + +SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) { + if (SIMD_CHECK && (pattern.u8[c] & ~7)) { + fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n", + pattern.u8[c], c); + abort(); + } + t.u8[c] = + 
a.u8[CONFIG_BIG_ENDIAN ? 7 - (pattern.u8[c] & 7) : pattern.u8[c] & 7]; + } + return t; +} + +SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) { + return a.s8[7] * b.u8[7] + a.s8[6] * b.u8[6] + a.s8[5] * b.u8[5] + + a.s8[4] * b.u8[4] + a.s8[3] * b.u8[3] + a.s8[2] * b.u8[2] + + a.s8[1] * b.u8[1] + a.s8[0] * b.u8[0]; +} + +SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) { + return (int64_t)(a.s16[3] * b.s16[3] + a.s16[2] * b.s16[2]) + + (int64_t)(a.s16[1] * b.s16[1] + a.s16[0] * b.s16[0]); +} + +SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) { + return a.u8[7] + a.u8[6] + a.u8[5] + a.u8[4] + a.u8[3] + a.u8[2] + a.u8[1] + + a.u8[0]; +} + +SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) { + return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0]; +} + +typedef uint32_t c_sad64_internal; + +/* Implementation dependent return value. Result must be finalised with + v64_sad_u8_sum(). + The result for more than 32 v64_sad_u8() calls is undefined. */ +SIMD_INLINE c_sad64_internal c_v64_sad_u8_init() { return 0; } + +SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a, + c_v64 b) { + int c; + for (c = 0; c < 8; c++) + s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; + return s; +} + +SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s; } + +typedef uint32_t c_ssd64_internal; + +/* Implementation dependent return value. Result must be finalised with + * v64_ssd_u8_sum(). 
*/ +SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init() { return 0; } + +SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a, + c_v64 b) { + int c; + for (c = 0; c < 8; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); + return s; +} + +SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; } + +SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 | b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 ^ b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 & b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 & ~b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = (int16_t)(a.s16[c] * b.s16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = (a.s16[c] * b.s16[c]) >> 16; + return t; +} + +SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) { + c_v64 t; + t.s32[0] = (int32_t)((int64_t)a.s32[0] * b.s32[0]); + t.s32[1] = (int32_t)((int64_t)a.s32[1] * b.s32[1]); + return t; +} + +SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) { + c_v64 t; + t.s32[0] = a.s16[0] * b.s16[0] + a.s16[1] * b.s16[1]; + t.s32[1] = a.s16[2] * b.s16[2] + a.s16[3] * b.s16[3]; + return t; +} + +SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) { + c_v64 t; + int32_t u; + u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1]; + t.s16[0] = u > 32767 ? 32767 : u < -32768 ? -32768 : u; + u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3]; + t.s16[1] = u > 32767 ? 32767 : u < -32768 ? -32768 : u; + u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5]; + t.s16[2] = u > 32767 ? 32767 : u < -32768 ? -32768 : u; + u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7]; + t.s16[3] = u > 32767 ? 32767 : u < -32768 ? 
-32768 : u; + return t; +} + +SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c] + 1) >> 1; + return t; +} + +SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c]) >> 1; + return t; +} + +SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c] + 1) >> 1; + return t; +} + +SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? b.u8[c] : a.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? a.u8[c] : b.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? b.s8[c] : a.s8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? a.s8[c] : b.s8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? b.s16[c] : a.s16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? 
a.s16[c] : b.s16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] > b.s8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] < b.s8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = -(a.u8[c] == b.u8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] > b.s16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] < b.s16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = -(a.u16[c] == b.u16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 7) { + fprintf(stderr, "Error: Undefined u8 shift left %d\n", n); + abort(); + } + for (c = 0; c < 8; c++) t.s8[c] = a.u8[c] << n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 7) { + fprintf(stderr, "Error: Undefined u8 shift right %d\n", n); + abort(); + } + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 7) { + fprintf(stderr, "Error: Undefined s8 shift right %d\n", n); + abort(); + } + for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 15) { + fprintf(stderr, "Error: Undefined u16 shift left %d\n", n); + abort(); + } + for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] << n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, 
unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 15) { + fprintf(stderr, "Error: Undefined u16 shift right %d\n", n); + abort(); + } + for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 15) { + fprintf(stderr, "Error: undefined s16 shift right %d\n", n); + abort(); + } + for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) { + c_v64 t; + if (SIMD_CHECK && n > 31) { + fprintf(stderr, "Error: undefined u32 shift left %d\n", n); + abort(); + } + t.u32[1] = a.u32[1] << n; + t.u32[0] = a.u32[0] << n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) { + c_v64 t; + if (SIMD_CHECK && n > 31) { + fprintf(stderr, "Error: undefined u32 shift right %d\n", n); + abort(); + } + t.u32[1] = a.u32[1] >> n; + t.u32[0] = a.u32[0] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) { + c_v64 t; + if (SIMD_CHECK && n > 31) { + fprintf(stderr, "Error: undefined s32 shift right %d\n", n); + abort(); + } + t.s32[1] = a.s32[1] >> n; + t.s32[0] = a.s32[0] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, unsigned int i) { + c_v64 t; + t.u64 = x.u64 >> i * 8; + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, unsigned int i) { + c_v64 t; + t.u64 = x.u64 << i * 8; + return t; +} + +SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, unsigned int c) { + if (SIMD_CHECK && c > 7) { + fprintf(stderr, "Error: undefined alignment %d\n", c); + abort(); + } + return c ? 
c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b; +} + +SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, unsigned int c) { + return c_v64_shl_8(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, unsigned int c) { + return c_v64_shr_u8(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, unsigned int c) { + return c_v64_shr_s8(a, c); +} + +SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, unsigned int c) { + return c_v64_shl_16(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, unsigned int c) { + return c_v64_shr_u16(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, unsigned int c) { + return c_v64_shr_s16(a, c); +} + +SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, unsigned int c) { + return c_v64_shl_32(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, unsigned int c) { + return c_v64_shr_u32(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) { + return c_v64_shr_s32(a, c); +} + +#endif /* _V64_INTRINSICS_C_H */ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h new file mode 100644 index 000000000..8dcc9f6fc --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h @@ -0,0 +1,470 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef _V64_INTRINSICS_H +#define _V64_INTRINSICS_H + +#include +#if defined(__SSSE3__) +#include +#endif +#if defined(__SSE4_1__) +#include +#endif + +typedef __m128i v64; + +SIMD_INLINE uint32_t v64_low_u32(v64 a) { + return (uint32_t)_mm_cvtsi128_si32(a); +} + +SIMD_INLINE uint32_t v64_high_u32(v64 a) { + return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4)); +} + +SIMD_INLINE int32_t v64_low_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(a); } + +SIMD_INLINE int32_t v64_high_s32(v64 a) { + return (int32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4)); +} + +SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { + return _mm_packs_epi32( + _mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d), + _mm_setzero_si128()); +} + +SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) { + return _mm_set_epi32(0, 0, x, y); +} + +SIMD_INLINE v64 v64_from_64(uint64_t x) { +#ifdef __x86_64__ + return _mm_cvtsi64_si128(x); +#else + return _mm_set_epi32(0, 0, x >> 32, (uint32_t)x); +#endif +} + +SIMD_INLINE uint64_t v64_u64(v64 x) { + return (uint64_t)v64_low_u32(x) | ((uint64_t)v64_high_u32(x) << 32); +} + +SIMD_INLINE uint32_t u32_load_aligned(const void *p) { + return *((uint32_t *)p); +} + +SIMD_INLINE uint32_t u32_load_unaligned(const void *p) { + return *((uint32_t *)p); +} + +SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) { + *((uint32_t *)p) = a; +} + +SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) { + *((uint32_t *)p) = a; +} + +SIMD_INLINE v64 v64_load_aligned(const void *p) { + return _mm_loadl_epi64((__m128i *)p); +} + +SIMD_INLINE v64 v64_load_unaligned(const void *p) { + return _mm_loadl_epi64((__m128i *)p); +} + +SIMD_INLINE void v64_store_aligned(void *p, v64 a) { + _mm_storel_epi64((__m128i *)p, a); +} + +SIMD_INLINE void v64_store_unaligned(void *p, v64 a) { + _mm_storel_epi64((__m128i *)p, a); +} + +// The following function requires an immediate. 
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ +#define v64_align(a, b, c) \ + ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b) +#else +#define v64_align(a, b, c) \ + ((c) ? v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \ + : (b)) +#endif + +SIMD_INLINE v64 v64_zero() { return _mm_setzero_si128(); } + +SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8(x); } + +SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16(x); } + +SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32(x); } + +SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); } + +SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); } + +SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); } + +SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); } + +SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return _mm_sub_epi8(a, b); } + +SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return _mm_subs_epu8(a, b); } + +SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return _mm_subs_epi8(a, b); } + +SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); } + +SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); } + +SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return _mm_subs_epu16(a, b); } + +SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); } + +SIMD_INLINE v64 v64_abs_s16(v64 a) { +#if defined(__SSSE3__) + return _mm_abs_epi16(a); +#else + return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a)); +#endif +} + +SIMD_INLINE v64 v64_abs_s8(v64 a) { +#if defined(__SSSE3__) + return _mm_abs_epi8(a); +#else + v64 sign = _mm_cmplt_epi8(a, _mm_setzero_si128()); + return _mm_xor_si128(sign, _mm_add_epi8(a, sign)); +#endif +} + +SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); } + +SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { + return _mm_srli_si128(_mm_unpacklo_epi8(b, a), 8); +} + +SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 
b) { return _mm_unpacklo_epi16(b, a); } + +SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { + return _mm_srli_si128(_mm_unpacklo_epi16(b, a), 8); +} + +SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); } + +SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { + return _mm_srli_si128(_mm_unpacklo_epi32(b, a), 8); +} + +SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) { + __m128i t = _mm_unpacklo_epi64(b, a); + return _mm_packs_epi32(t, t); +} + +SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) { + __m128i t = _mm_unpacklo_epi64(b, a); + return _mm_packus_epi16(t, t); +} + +SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) { + __m128i t = _mm_unpacklo_epi64(b, a); + return _mm_packs_epi16(t, t); +} + +SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0f0d0b0907050301LL)); +#else + return _mm_packus_epi16( + _mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)), + _mm_setzero_si128()); +#endif +} + +SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0e0c0a0806040200LL)); +#else + return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1)); +#endif +} + +SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0f0e0b0a07060302LL)); +#else + return _mm_packs_epi32( + _mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)), + _mm_setzero_si128()); +#endif +} + +SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0d0c090805040100LL)); +#else + return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2)); +#endif +} + +SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { + return _mm_unpacklo_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { + return 
_mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8); +} + +SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { + return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); +} + +SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { + return _mm_srli_si128(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), 8); +} + +SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) { + return _mm_unpacklo_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) { + return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16); +} + +SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) { + return _mm_srli_si128(_mm_unpacklo_epi16(a, _mm_setzero_si128()), 8); +} + +SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) { + return _mm_srli_si128( + _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16), 8); +} + +SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(x, pattern); +#else + v64 output; + unsigned char *input = (unsigned char *)&x; + unsigned char *index = (unsigned char *)&pattern; + char *selected = (char *)&output; + int counter; + + for (counter = 0; counter < 8; counter++) { + selected[counter] = input[index[counter]]; + } + + return output; +#endif +} + +SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { + __m128i r, r1, r2, z; + z = _mm_setzero_si128(); + r1 = _mm_madd_epi16(_mm_slli_epi16(_mm_unpacklo_epi8(a, z), 8), + _mm_unpacklo_epi8(b, z)); + r2 = _mm_srli_si128(r1, 8); + r = _mm_add_epi32(r1, r2); + r = _mm_add_epi32(r, _mm_srli_si128(r, 4)); + return ((int32_t)v64_low_u32(r)) >> 8; +} + +SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { + __m128i r = _mm_madd_epi16(a, b); +#if defined(__SSE4_1__) && defined(__x86_64__) + __m128i x = _mm_cvtepi32_epi64(r); + return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8))); +#else + return (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) + + (int64_t)_mm_cvtsi128_si32(r); +#endif +} + +SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { + return v64_low_u32(_mm_sad_epu8(a, 
_mm_setzero_si128())); +} + +SIMD_INLINE int64_t v64_hadd_s16(v64 a) { + return v64_dotp_s16(a, v64_dup_16(1)); +} + +typedef v64 sad64_internal; + +SIMD_INLINE sad64_internal v64_sad_u8_init() { return _mm_setzero_si128(); } + +/* Implementation dependent return value. Result must be finalised with + v64_sad_u8_sum(). + The result for more than 32 v64_sad_u8() calls is undefined. */ +SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) { + return _mm_add_epi64(s, _mm_sad_epu8(a, b)); +} + +SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { return v64_low_u32(s); } + +typedef v64 ssd64_internal; + +SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return _mm_setzero_si128(); } + +/* Implementation dependent return value. Result must be finalised with + * v64_ssd_u8_sum(). */ +SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) { + v64 l = v64_sub_16(v64_ziplo_8(v64_zero(), a), v64_ziplo_8(v64_zero(), b)); + v64 h = v64_sub_16(v64_ziphi_8(v64_zero(), a), v64_ziphi_8(v64_zero(), b)); + v64 r = v64_add_32(_mm_madd_epi16(l, l), _mm_madd_epi16(h, h)); + return _mm_add_epi64( + s, v64_ziplo_32(v64_zero(), _mm_add_epi32(r, _mm_srli_si128(r, 4)))); +} + +SIMD_INLINE uint32_t v64_ssd_u8_sum(sad64_internal s) { return v64_low_u32(s); } + +SIMD_INLINE v64 v64_or(v64 a, v64 b) { return _mm_or_si128(a, b); } + +SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return _mm_xor_si128(a, b); } + +SIMD_INLINE v64 v64_and(v64 a, v64 b) { return _mm_and_si128(a, b); } + +SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return _mm_andnot_si128(b, a); } + +SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return _mm_mullo_epi16(a, b); } + +SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return _mm_mulhi_epi16(a, b); } + +SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { +#if defined(__SSE4_1__) + return _mm_mullo_epi32(a, b); +#else + return _mm_unpacklo_epi32( + _mm_mul_epu32(a, b), + _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4))); +#endif +} + +SIMD_INLINE v64 
v64_madd_s16(v64 a, v64 b) { return _mm_madd_epi16(a, b); } + +SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_maddubs_epi16(a, b); +#else + __m128i t = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), + _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)); + return _mm_packs_epi32(t, t); +#endif +} + +SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return _mm_avg_epu8(a, b); } + +SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { + return _mm_sub_epi8(_mm_avg_epu8(a, b), + _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1))); +} + +SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); } + +SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); } + +SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return _mm_max_epu8(a, b); } + +SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { +#if defined(__SSE4_1__) + return _mm_min_epi8(a, b); +#else + v64 mask = _mm_cmplt_epi8(a, b); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { +#if defined(__SSE4_1__) + return _mm_max_epi8(a, b); +#else + v64 mask = _mm_cmplt_epi8(b, a); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return _mm_min_epi16(a, b); } + +SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return _mm_max_epi16(a, b); } + +SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return _mm_cmpgt_epi8(a, b); } + +SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return _mm_cmplt_epi8(a, b); } + +SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return _mm_cmpeq_epi8(a, b); } + +SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return _mm_cmpgt_epi16(a, b); } + +SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); } + +SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); } + +SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)), + 
_mm_sll_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8(0xff >> c), + _mm_srl_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) { + return _mm_packs_epi16( + _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128(c + 8)), a); +} + +SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) { + return _mm_sll_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) { + return _mm_srl_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) { + return _mm_sra_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) { + return _mm_sll_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) { + return _mm_srl_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) { + return _mm_sra_epi32(a, _mm_cvtsi32_si128(c)); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. 
/* These intrinsics require immediate values, so we must use #defines
   to enforce that.
   `c` is the shift count.  Where it takes part in arithmetic inside the
   expansion it is parenthesized, so that an expression argument (for example
   `n - 1` or `1 << k`) keeps its meaning.
   `a` is expanded more than once by v64_shr_n_byte and v64_shr_n_s8, so pass
   an expression without side effects. */
#define v64_shl_n_byte(a, c) _mm_slli_si128(a, c)
#define v64_shr_n_byte(a, c) \
  _mm_srli_si128(_mm_unpacklo_epi64(a, a), (c) + 8)
#define v64_shl_n_8(a, c) \
  _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
#define v64_shr_n_u8(a, c) \
  _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
#define v64_shr_n_s8(a, c) \
  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a)
#define v64_shl_n_16(a, c) _mm_slli_epi16(a, c)
#define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c)
#define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c)
#define v64_shl_n_32(a, c) _mm_slli_epi32(a, c)
#define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c)
#define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c)
// SSIM stabilising constants for 8-, 10- and 12-bit samples.  Each one is
// pre-multiplied by 64^2 (= 1 << 12); similarity() rescales it to the actual
// pixel count with (constant * count * count) >> 12.
static const int64_t cc1 = 26634;        // 64^2 * (.01 * 255)^2
static const int64_t cc2 = 239708;       // 64^2 * (.03 * 255)^2
static const int64_t cc1_10 = 428658;    // 64^2 * (.01 * 1023)^2
static const int64_t cc2_10 = 3857925;   // 64^2 * (.03 * 1023)^2
static const int64_t cc1_12 = 6868593;   // 64^2 * (.01 * 4095)^2
static const int64_t cc2_12 = 61817334;  // 64^2 * (.03 * 4095)^2

// Computes the SSIM score of one window from its raw sums.
//
//   sum_s, sum_r        sum of the source / reference samples
//   sum_sq_s, sum_sq_r  sum of the squared source / reference samples
//   sum_sxr             sum of source * reference products
//   count               number of samples that went into the sums
//   bd                  bit depth selecting the constants; must be 8, 10 or 12
//                       (anything else trips the assert and, with asserts
//                       disabled, falls back to c1 = c2 = 0)
//
// The score is
//   (2*sum_s*sum_r + c1) * (2*count*sum_sxr - 2*sum_s*sum_r + c2)
//   -------------------------------------------------------------------
//   (sum_s^2 + sum_r^2 + c1) *
//       (count*sum_sq_s - sum_s^2 + count*sum_sq_r - sum_r^2 + c2)
//
// Every factor is evaluated in int64_t: doing the sum products in uint32_t
// wraps once the sums exceed 16 bits, which 10- and 12-bit windows do.  The
// two factors of the numerator and of the denominator are multiplied as
// doubles because their exact product can exceed the int64_t range at high
// bit depth.
static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
                         uint32_t sum_sq_r, uint32_t sum_sxr, int count,
                         uint32_t bd) {
  int64_t c1, c2;
  int64_t lum_n, lum_d, cov_n, var_d;

  if (bd == 8) {
    // scale the constants by number of pixels
    c1 = (cc1 * count * count) >> 12;
    c2 = (cc2 * count * count) >> 12;
  } else if (bd == 10) {
    c1 = (cc1_10 * count * count) >> 12;
    c2 = (cc2_10 * count * count) >> 12;
  } else if (bd == 12) {
    c1 = (cc1_12 * count * count) >> 12;
    c2 = (cc2_12 * count * count) >> 12;
  } else {
    c1 = c2 = 0;
    assert(0);
  }

  // Luminance term (numerator / denominator).
  lum_n = (int64_t)2 * sum_s * sum_r + c1;
  lum_d = (int64_t)sum_s * sum_s + (int64_t)sum_r * sum_r + c1;

  // Covariance (numerator) and summed variance (denominator) terms.
  cov_n = (int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2;
  var_d = (int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
          (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2;

  return ((double)lum_n * (double)cov_n) / ((double)lum_d * (double)var_d);
}
+static double aom_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, + int height) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +#if CONFIG_HIGHBITDEPTH +static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, + int height, uint32_t bd, uint32_t shift) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, + CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, + shift); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} +#endif // CONFIG_HIGHBITDEPTH + +double aom_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight) { + double a, b, c; + double ssimv; + + a = aom_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, + dest->y_stride, source->y_crop_width, source->y_crop_height); + + b = aom_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, source->uv_crop_height); + + c = aom_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, source->uv_crop_height); + + ssimv = a * .8 + .1 * (b + c); + + *weight = 1; + + return ssimv; +} + +// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity +// +// Re working out the math -> +// +// ssim(x,y) = (2*mean(x)*mean(y) + 
c1)*(2*cov(x,y)+c2) / +// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2)) +// +// mean(x) = sum(x) / n +// +// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n) +// +// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n) +// +// ssim(x,y) = +// (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) / +// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) * +// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+ +// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2))) +// +// factoring out n*n +// +// ssim(x,y) = +// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) / +// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) * +// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2)) +// +// Replace c1 with n*n * c1 for the final step that leads to this code: +// The final step scales by 12 bits so we don't lose precision in the constants. + +static double ssimv_similarity(const Ssimv *sv, int64_t n) { + // Scale the constants by number of pixels. + const int64_t c1 = (cc1 * n * n) >> 12; + const int64_t c2 = (cc2 * n * n) >> 12; + + const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) / + (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1); + + // Since these variables are unsigned sums, convert to double so + // math is done in double arithmetic. + const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); + + return l * v; +} + +// The first term of the ssim metric is a luminance factor. +// +// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1) +// +// This luminance factor is super sensitive to the dark side of luminance +// values and completely insensitive on the white side. check out 2 sets +// (1,3) and (250,252) the term gives ( 2*1*3/(1+9) = .60 +// 2*250*252/ (250^2+252^2) => .99999997 +// +// As a result in this tweaked version of the calculation in which the +// luminance is taken as percentage off from peak possible. 
+// +// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count +// +static double ssimv_similarity2(const Ssimv *sv, int64_t n) { + // Scale the constants by number of pixels. + const int64_t c1 = (cc1 * n * n) >> 12; + const int64_t c2 = (cc2 * n * n) >> 12; + + const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n; + const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1); + + // Since these variables are unsigned, sums convert to double so + // math is done in double arithmetic. + const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); + + return l * v; +} +static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, Ssimv *sv) { + aom_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r, + &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr); +} + +double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, int width, int height, Ssimv *sv2, + Metrics *m, int do_inconsistency) { + double dssim_total = 0; + double ssim_total = 0; + double ssim2_total = 0; + double inconsistency_total = 0; + int i, j; + int c = 0; + double norm; + double old_ssim_total = 0; + aom_clear_system_state(); + // We can sample points as frequently as we like start with 1 per 4x4. + for (i = 0; i < height; + i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { + for (j = 0; j < width; j += 4, ++c) { + Ssimv sv = { 0 }; + double ssim; + double ssim2; + double dssim; + uint32_t var_new; + uint32_t var_old; + uint32_t mean_new; + uint32_t mean_old; + double ssim_new; + double ssim_old; + + // Not sure there's a great way to handle the edge pixels + // in ssim when using a window. Seems biased against edge pixels + // however you handle this. This uses only samples that are + // fully in the frame. 
+ if (j + 8 <= width && i + 8 <= height) { + ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv); + } + + ssim = ssimv_similarity(&sv, 64); + ssim2 = ssimv_similarity2(&sv, 64); + + sv.ssim = ssim2; + + // dssim is calculated to use as an actual error metric and + // is scaled up to the same range as sum square error. + // Since we are subsampling every 16th point maybe this should be + // *16 ? + dssim = 255 * 255 * (1 - ssim2) / 2; + + // Here I introduce a new error metric: consistency-weighted + // SSIM-inconsistency. This metric isolates frames where the + // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much + // sharper or blurrier than the others. Higher values indicate a + // temporally inconsistent SSIM. There are two ideas at work: + // + // 1) 'SSIM-inconsistency': the total inconsistency value + // reflects how much SSIM values are changing between this + // source / reference frame pair and the previous pair. + // + // 2) 'consistency-weighted': weights de-emphasize areas in the + // frame where the scene content has changed. Changes in scene + // content are detected via changes in local variance and local + // mean. + // + // Thus the overall measure reflects how inconsistent the SSIM + // values are, over consistent regions of the frame. + // + // The metric has three terms: + // + // term 1 -> uses change in scene Variance to weight error score + // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2) + // larger changes from one frame to the next mean we care + // less about consistency. + // + // term 2 -> uses change in local scene luminance to weight error + // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2) + // larger changes from one frame to the next mean we care + // less about consistency. + // + // term3 -> measures inconsistency in ssim scores between frames + // 1 - ( 2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+sssim(Fi-1)^2). + // + // This term compares the ssim score for the same location in 2 + // subsequent frames. 
+ var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64; + var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64; + mean_new = sv.sum_s; + mean_old = sv2[c].sum_s; + ssim_new = sv.ssim; + ssim_old = sv2[c].ssim; + + if (do_inconsistency) { + // We do the metric once for every 4x4 block in the image. Since + // we are scaling the error to SSE for use in a psnr calculation + // 1.0 = 4x4x255x255 the worst error we can possibly have. + static const double kScaling = 4. * 4 * 255 * 255; + + // The constants have to be non 0 to avoid potential divide by 0 + // issues other than that they affect kind of a weighting between + // the terms. No testing of what the right terms should be has been + // done. + static const double c1 = 1, c2 = 1, c3 = 1; + + // This measures how much consistent variance is in two consecutive + // source frames. 1.0 means they have exactly the same variance. + const double variance_term = + (2.0 * var_old * var_new + c1) / + (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1); + + // This measures how consistent the local mean are between two + // consecutive frames. 1.0 means they have exactly the same mean. + const double mean_term = + (2.0 * mean_old * mean_new + c2) / + (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2); + + // This measures how consistent the ssims of two + // consecutive frames is. 1.0 means they are exactly the same. + double ssim_term = + pow((2.0 * ssim_old * ssim_new + c3) / + (ssim_old * ssim_old + ssim_new * ssim_new + c3), + 5); + + double this_inconsistency; + + // Floating point math sometimes makes this > 1 by a tiny bit. + // We want the metric to scale between 0 and 1.0 so we can convert + // it to an snr scaled value. + if (ssim_term > 1) ssim_term = 1; + + // This converts the consistency metric to an inconsistency metric + // ( so we can scale it like psnr to something like sum square error. 
+ // The reason for the variance and mean terms is the assumption that + // if there are big changes in the source we shouldn't penalize + // inconsistency in ssim scores a bit less as it will be less visible + // to the user. + this_inconsistency = (1 - ssim_term) * variance_term * mean_term; + + this_inconsistency *= kScaling; + inconsistency_total += this_inconsistency; + } + sv2[c] = sv; + ssim_total += ssim; + ssim2_total += ssim2; + dssim_total += dssim; + + old_ssim_total += ssim_old; + } + old_ssim_total += 0; + } + + norm = 1. / (width / 4) / (height / 4); + ssim_total *= norm; + ssim2_total *= norm; + m->ssim2 = ssim2_total; + m->ssim = ssim_total; + if (old_ssim_total == 0) inconsistency_total = 0; + + m->ssimc = inconsistency_total; + + m->dssim = dssim_total; + return inconsistency_total; +} + +#if CONFIG_HIGHBITDEPTH +double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, + uint32_t bd, uint32_t in_bd) { + double a, b, c; + double ssimv; + uint32_t shift = 0; + + assert(bd >= in_bd); + shift = bd - in_bd; + + a = aom_highbd_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, + dest->y_stride, source->y_crop_width, + source->y_crop_height, in_bd, shift); + + b = aom_highbd_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, shift); + + c = aom_highbd_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, shift); + + ssimv = a * .8 + .1 * (b + c); + + *weight = 1; + + return ssimv; +} + +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/ssim.h b/third_party/aom/aom_dsp/ssim.h new file mode 100644 index 000000000..902735e50 --- /dev/null +++ b/third_party/aom/aom_dsp/ssim.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_SSIM_H_ +#define AOM_DSP_SSIM_H_ + +#define MAX_SSIM_DB 100.0; + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./aom_config.h" +#include "aom_scale/yv12config.h" + +// metrics used for calculating ssim, ssim2, dssim, and ssimc +typedef struct { + // source sum ( over 8x8 region ) + uint32_t sum_s; + + // reference sum (over 8x8 region ) + uint32_t sum_r; + + // source sum squared ( over 8x8 region ) + uint32_t sum_sq_s; + + // reference sum squared (over 8x8 region ) + uint32_t sum_sq_r; + + // sum of source times reference (over 8x8 region) + uint32_t sum_sxr; + + // calculated ssim score between source and reference + double ssim; +} Ssimv; + +// metrics collected on a frame basis +typedef struct { + // ssim consistency error metric ( see code for explanation ) + double ssimc; + + // standard ssim + double ssim; + + // revised ssim ( see code for explanation) + double ssim2; + + // ssim restated as an error metric like sse + double dssim; + + // dssim converted to decibels + double dssimd; + + // ssimc converted to decibels + double ssimcd; +} Metrics; + +double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, int width, int height, Ssimv *sv2, + Metrics *m, int do_inconsistency); + +double aom_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight); + +double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *ssim_y, + double *ssim_u, double 
*ssim_v, uint32_t bd, + uint32_t in_bd); + +#if CONFIG_HIGHBITDEPTH +double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, + uint32_t bd, uint32_t in_bd); +#endif // CONFIG_HIGHBITDEPTH + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_SSIM_H_ diff --git a/third_party/aom/aom_dsp/subtract.c b/third_party/aom/aom_dsp/subtract.c new file mode 100644 index 000000000..8dda96efb --- /dev/null +++ b/third_party/aom/aom_dsp/subtract.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
// Writes the residual diff = src - pred for a rows x cols block of 8-bit
// samples.  Each buffer advances by its own stride (in elements) after every
// row; only the first `cols` entries of each diff row are written.
void aom_subtract_block_c(int rows, int cols, int16_t *diff,
                          ptrdiff_t diff_stride, const uint8_t *src,
                          ptrdiff_t src_stride, const uint8_t *pred,
                          ptrdiff_t pred_stride) {
  int rows_left = rows;

  while (rows_left-- > 0) {
    int x = 0;

    while (x < cols) {
      diff[x] = (int16_t)(src[x] - pred[x]);
      ++x;
    }

    src += src_stride;
    pred += pred_stride;
    diff += diff_stride;
  }
}

#if CONFIG_HIGHBITDEPTH
// Same as aom_subtract_block_c() for 16-bit samples.  src8 / pred8 are
// converted with CONVERT_TO_SHORTPTR before being read; bd is unused.
void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
                                 ptrdiff_t diff_stride, const uint8_t *src8,
                                 ptrdiff_t src_stride, const uint8_t *pred8,
                                 ptrdiff_t pred_stride, int bd) {
  uint16_t *src16 = CONVERT_TO_SHORTPTR(src8);
  uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred8);
  int rows_left = rows;
  (void)bd;

  while (rows_left-- > 0) {
    int x = 0;

    while (x < cols) {
      diff[x] = (int16_t)(src16[x] - pred16[x]);
      ++x;
    }

    src16 += src_stride;
    pred16 += pred_stride;
    diff += diff_stride;
  }
}
#endif  // CONFIG_HIGHBITDEPTH
// Returns the sum of squares of a width x height block of 16-bit values.
// src_stride is the distance, in elements, between the starts of two rows.
// A non-positive width or height yields 0.
uint64_t aom_sum_squares_2d_i16_c(const int16_t *src, int src_stride, int width,
                                  int height) {
  int r, c;
  uint64_t ss = 0;

  for (r = 0; r < height; r++) {
    for (c = 0; c < width; c++) {
      const int16_t v = src[c];
      // v * v is at most 2^30, so the int product cannot overflow.
      ss += v * v;
    }
    src += src_stride;
  }

  return ss;
}

// Returns the sum of squares of the n values starting at src.
// n == 0 reads nothing and returns 0 (a do/while counting down from 0 would
// wrap the counter and run far past the end of the buffer).
uint64_t aom_sum_squares_i16_c(const int16_t *src, uint32_t n) {
  uint64_t ss = 0;

  while (n > 0) {
    const int16_t v = *src++;
    ss += v * v;
    --n;
  }

  return ss;
}
+ */ + +#ifndef AOM_DSP_TXFM_COMMON_H_ +#define AOM_DSP_TXFM_COMMON_H_ + +#include "aom_dsp/aom_dsp_common.h" + +// Constants and Macros used by all idct/dct functions +#define DCT_CONST_BITS 14 +#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) + +#define UNIT_QUANT_SHIFT 2 +#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT) + +// Constants: +// for (int i = 1; i< 32; ++i) +// printf("static const int cospi_%d_64 = %.0f;\n", i, +// round(16384 * cos(i*M_PI/64))); +// Note: sin(k*Pi/64) = cos((32-k)*Pi/64) +static const tran_high_t cospi_1_64 = 16364; +static const tran_high_t cospi_2_64 = 16305; +static const tran_high_t cospi_3_64 = 16207; +static const tran_high_t cospi_4_64 = 16069; +static const tran_high_t cospi_5_64 = 15893; +static const tran_high_t cospi_6_64 = 15679; +static const tran_high_t cospi_7_64 = 15426; +static const tran_high_t cospi_8_64 = 15137; +static const tran_high_t cospi_9_64 = 14811; +static const tran_high_t cospi_10_64 = 14449; +static const tran_high_t cospi_11_64 = 14053; +static const tran_high_t cospi_12_64 = 13623; +static const tran_high_t cospi_13_64 = 13160; +static const tran_high_t cospi_14_64 = 12665; +static const tran_high_t cospi_15_64 = 12140; +static const tran_high_t cospi_16_64 = 11585; +static const tran_high_t cospi_17_64 = 11003; +static const tran_high_t cospi_18_64 = 10394; +static const tran_high_t cospi_19_64 = 9760; +static const tran_high_t cospi_20_64 = 9102; +static const tran_high_t cospi_21_64 = 8423; +static const tran_high_t cospi_22_64 = 7723; +static const tran_high_t cospi_23_64 = 7005; +static const tran_high_t cospi_24_64 = 6270; +static const tran_high_t cospi_25_64 = 5520; +static const tran_high_t cospi_26_64 = 4756; +static const tran_high_t cospi_27_64 = 3981; +static const tran_high_t cospi_28_64 = 3196; +static const tran_high_t cospi_29_64 = 2404; +static const tran_high_t cospi_30_64 = 1606; +static const tran_high_t cospi_31_64 = 804; + +// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 
+static const tran_high_t sinpi_1_9 = 5283;
+static const tran_high_t sinpi_2_9 = 9929;
+static const tran_high_t sinpi_3_9 = 13377;
+static const tran_high_t sinpi_4_9 = 15212;
+
+// 16384 * sqrt(2)
+static const tran_high_t Sqrt2 = 23170;
+
+#endif  // AOM_DSP_TXFM_COMMON_H_
diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c
new file mode 100644
index 000000000..9fc0db783
--- /dev/null
+++ b/third_party/aom/aom_dsp/variance.c
@@ -0,0 +1,1249 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/variance.h"
+#include "aom_dsp/aom_filter.h"
+
+// Returns the sum of squared differences between two 4x4 blocks of 8-bit
+// samples.
+uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
+                            int b_stride) {
+  int sq_total = 0;
+  int row, col;
+
+  for (row = 0; row < 4; ++row, a += a_stride, b += b_stride) {
+    for (col = 0; col < 4; ++col) {
+      const int delta = a[col] - b[col];
+      sq_total += delta * delta;
+    }
+  }
+
+  return sq_total;
+}
+
+// Returns the sum of squares of the 256 consecutive int16_t values at a.
+uint32_t aom_get_mb_ss_c(const int16_t *a) {
+  unsigned int total = 0;
+  unsigned int k;
+
+  for (k = 0; k < 256; ++k) {
+    const int v = a[k];
+    total += v * v;
+  }
+
+  return total;
+}
+
+// 16x16 sub-pixel variance with xoffset fixed at 4 and yoffset at 0.
+uint32_t aom_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride,
+                                          const uint8_t *b, int b_stride,
+                                          uint32_t *sse) {
+  const int xoffset = 4;
+  const int yoffset = 0;
+  return aom_sub_pixel_variance16x16_c(a, a_stride, xoffset, yoffset, b,
+                                       b_stride, sse);
+}
+
+// 16x16 sub-pixel variance with xoffset fixed at 0 and yoffset at 4.
+uint32_t aom_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride,
+                                          const uint8_t *b, int b_stride,
+                                          uint32_t *sse) {
+  const int xoffset = 0;
+  const int yoffset = 4;
+  return aom_sub_pixel_variance16x16_c(a, a_stride, xoffset, yoffset, b,
+                                       b_stride, sse);
+}
+
+// 16x16 sub-pixel variance with both xoffset and yoffset fixed at 4.
+uint32_t aom_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride,
+                                           const uint8_t *b, int b_stride,
+                                           uint32_t *sse) {
+  const int xoffset = 4;
+  const int yoffset = 4;
+  return aom_sub_pixel_variance16x16_c(a, a_stride, xoffset, yoffset, b,
+                                       b_stride, sse);
+}
+
+// Over a w x h block, stores the sum of the pixel differences (a - b) in *sum
+// and the sum of the squared differences in *sse.
+static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
+                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
+  uint32_t sq_acc = 0;
+  int diff_acc = 0;
+  int row, col;
+
+  for (row = 0; row < h; ++row, a += a_stride, b += b_stride) {
+    for (col = 0; col < w; ++col) {
+      const int delta = a[col] - b[col];
+      diff_acc += delta;
+      sq_acc += delta * delta;
+    }
+  }
+
+  *sum = diff_acc;
+  *sse = sq_acc;
+}
+
+// Returns the SSE of a w x h block; the difference sum is computed as a
+// by-product and discarded.
+uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
+                          int b_stride, int w, int h) {
+  uint32_t block_sse;
+  int unused_sum;
+  variance(a, a_stride, b, b_stride, w, h, &block_sse, &unused_sum);
+  return block_sse;
+}
+
+// First pass of the 2-D separable bilinear filter: runs a 1-D 2-tap filter
+// over the source block, horizontally or vertically, and writes the filtered
+// block to b.
+//
+// Output is uint16_t to retain precision for the next pass. The two filter
+// taps should sum to FILTER_WEIGHT. pixel_step is the offset from one filter
+// input to the next: 1 filters horizontally, the stride filters vertically.
+static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
+                                              unsigned int src_pixels_per_line,
+                                              int pixel_step,
+                                              unsigned int output_height,
+                                              unsigned int output_width,
+                                              const uint8_t *filter) {
+  unsigned int row, col;
+
+  for (row = 0; row < output_height; ++row) {
+    for (col = 0; col < output_width; ++col, ++a) {
+      const int taps = (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1];
+      b[col] = ROUND_POWER_OF_TWO(taps, FILTER_BITS);
+    }
+
+    // a has already advanced output_width samples; skip the rest of the row.
+    a += src_pixels_per_line - output_width;
+    b += output_width;
+  }
+}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
+// or vertical direction to produce the filtered output block. Used to implement
+// the second-pass of 2-D separable filter.
+//
+// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
+// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
+// filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride). It defines the offset required to move from one input
+// to the next. Output is 8-bit.
+static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
+                                               unsigned int src_pixels_per_line,
+                                               unsigned int pixel_step,
+                                               unsigned int output_height,
+                                               unsigned int output_width,
+                                               const uint8_t *filter) {
+  unsigned int row, col;
+
+  for (row = 0; row < output_height; ++row) {
+    for (col = 0; col < output_width; ++col, ++a) {
+      const int taps = (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1];
+      b[col] = ROUND_POWER_OF_TWO(taps, FILTER_BITS);
+    }
+
+    // a has already advanced output_width samples; skip the rest of the row.
+    a += src_pixels_per_line - output_width;
+    b += output_width;
+  }
+}
+
+/* Defines aom_variance<W>x<H>_c(): stores the block SSE in *sse and returns
+ * sse - sum^2 / (W * H).
+ */
+#define VAR(W, H) \
+  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                     const uint8_t *b, int b_stride, \
+                                     uint32_t *sse) { \
+    int diff_sum; \
+    variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum); \
+    return *sse - (uint32_t)(((int64_t)diff_sum * diff_sum) / (W * H)); \
+  }
+
+/* Defines aom_sub_pixel_variance<W>x<H>_c(): filters block a with the 2-tap
+ * bilinear filters selected by xoffset and yoffset (H + 1 rows in the first
+ * pass, H rows out of the second), then returns the variance of the filtered
+ * block against b.
+ */
+#define SUBPIX_VAR(W, H) \
+  uint32_t aom_sub_pixel_variance##W##x##H##_c( \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+      const uint8_t *b, int b_stride, uint32_t *sse) { \
+    uint16_t first_pass[(H + 1) * W]; \
+    uint8_t filtered[H * W]; \
+    \
+    var_filter_block2d_bil_first_pass(a, first_pass, a_stride, 1, H + 1, W, \
+                                      bilinear_filters_2t[xoffset]); \
+    var_filter_block2d_bil_second_pass(first_pass, filtered, W, W, H, W, \
+                                       bilinear_filters_2t[yoffset]); \
+    \
+    return aom_variance##W##x##H##_c(filtered, W, b, b_stride, sse); \
+  }
+
+/* Defines aom_sub_pixel_avg_variance<W>x<H>_c(): same filtering as
+ * SUBPIX_VAR, but the filtered block is first combined with second_pred via
+ * aom_comp_avg_pred() before the variance against b is taken.
+ */
+#define SUBPIX_AVG_VAR(W, H) \
+  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+      const uint8_t *b, int b_stride, uint32_t *sse, \
+      const uint8_t *second_pred) { \
+    uint16_t first_pass[(H + 1) * W]; \
+    uint8_t filtered[H * W]; \
+    DECLARE_ALIGNED(16, uint8_t, averaged[H * W]); \
+    \
+    var_filter_block2d_bil_first_pass(a, first_pass, a_stride, 1, H + 1, W, \
+                                      bilinear_filters_2t[xoffset]); \
+    var_filter_block2d_bil_second_pass(first_pass, filtered, W, W, H, W, \
+                                       bilinear_filters_2t[yoffset]); \
+    \
+    aom_comp_avg_pred(averaged, second_pred, W, H, filtered, W); \
+    \
+    return aom_variance##W##x##H##_c(averaged, W, b, b_stride, sse); \
+  }
+
+/* Same computation as the variance call, except that the difference sum is
+ * handed back through the extra sum parameter and nothing is returned, rather
+ * than returning sse - sum^2 / (w * h).
+ */
+#define GET_VAR(W, H) \
+  void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
+                               const uint8_t *b, int b_stride, uint32_t *sse, \
+                               int *sum) { \
+    variance(a, a_stride, b, b_stride, W, H, sse, sum); \
+  }
+
+/* Same computation as the variance call, except that sse - sum^2 / (w * h) is
+ * not calculated: sse is returned in addition to being stored through the
+ * passed-in pointer.
+ */
+#define MSE(W, H) \
+  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                const uint8_t *b, int b_stride, \
+                                uint32_t *sse) { \
+    int unused_sum; \
+    variance(a, a_stride, b, b_stride, W, H, sse, &unused_sum); \
+    return *sse; \
+  }
+
+/* All three forms of the variance are available in the same sizes. */
+#define VARIANCES(W, H) \
+  VAR(W, H) \
+  SUBPIX_VAR(W, H) \
+  SUBPIX_AVG_VAR(W, H)
+
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+VARIANCES(128, 128)
+VARIANCES(128, 64)
+VARIANCES(64, 128)
+#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
+VARIANCES(64, 64)
+VARIANCES(64, 32)
+VARIANCES(32, 64)
+VARIANCES(32, 32)
+VARIANCES(32, 16)
+VARIANCES(16, 32)
+VARIANCES(16, 16)
+VARIANCES(16, 8)
+VARIANCES(8, 16)
+VARIANCES(8, 8)
+VARIANCES(8, 4)
+VARIANCES(4, 8)
+VARIANCES(4, 4)
+VARIANCES(4, 2)
+VARIANCES(2, 4)
+VARIANCES(2, 2)
+
+GET_VAR(16, 16)
+GET_VAR(8, 8)
+
+MSE(16, 16)
+MSE(16, 8)
+MSE(8, 16)
+MSE(8, 8)
+
+// Writes ROUND_POWER_OF_TWO(pred + ref, 1) for every sample of a
+// width x height block. comp_pred and pred are packed with a row pitch of
+// width; ref rows are ref_stride apart.
+void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
+                         int height, const uint8_t *ref, int ref_stride) {
+  int row, col;
+
+  for (row = 0; row < height; ++row) {
+    for (col = 0; col < width; ++col) {
+      const int pair_sum = pred[col] + ref[col];
+      comp_pred[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
+
+// Get pred block from up-sampled reference.
+void aom_upsampled_pred_c(uint8_t *comp_pred, int width, int height,
+                          const uint8_t *ref, int ref_stride) {
+  // Every 8th sample of ref is taken, horizontally and vertically.
+  const int ref_row_step = ref_stride << 3;
+  int row, col;
+
+  for (row = 0; row < height; ++row) {
+    for (col = 0; col < width; ++col) {
+      comp_pred[col] = ref[col << 3];
+    }
+    comp_pred += width;
+    ref += ref_row_step;
+  }
+}
+
+// Averages pred with every 8th sample of ref (both directions) using
+// ROUND_POWER_OF_TWO(sum, 1). comp_pred and pred use a row pitch of width.
+void aom_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
+                                   int width, int height, const uint8_t *ref,
+                                   int ref_stride) {
+  const int ref_row_step = ref_stride << 3;
+  int row, col;
+
+  for (row = 0; row < height; ++row) {
+    for (col = 0; col < width; ++col) {
+      const int pair_sum = ref[(col << 3)] + pred[col];
+      comp_pred[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_row_step;
+  }
+}
+
+#if CONFIG_HIGHBITDEPTH
+// Over a w x h block of 16-bit samples (a8 and b8 are passed through
+// CONVERT_TO_SHORTPTR), stores the sum of differences in *sum and the sum of
+// squared differences in *sse, both as 64-bit values.
+static void highbd_variance64(const uint8_t *a8, int a_stride,
+                              const uint8_t *b8, int b_stride, int w, int h,
+                              uint64_t *sse, int64_t *sum) {
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint64_t sq_acc = 0;
+  int64_t diff_acc = 0;
+  int row, col;
+
+  for (row = 0; row < h; ++row, a += a_stride, b += b_stride) {
+    for (col = 0; col < w; ++col) {
+      const int delta = a[col] - b[col];
+      diff_acc += delta;
+      sq_acc += delta * delta;
+    }
+  }
+
+  *sum = diff_acc;
+  *sse = sq_acc;
+}
+
+// Returns the 64-bit SSE of a w x h high-bitdepth block; the difference sum
+// is discarded.
+uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
+                                 const uint8_t *b, int b_stride, int w, int h) {
+  uint64_t block_sse;
+  int64_t unused_sum;
+  highbd_variance64(a, a_stride, b, b_stride, w, h, &block_sse, &unused_sum);
+  return block_sse;
+}
+
+// Narrows the 64-bit accumulators to the 32-bit outputs without rescaling.
+static void highbd_8_variance(const uint8_t *a8, int a_stride,
+                              const uint8_t *b8, int b_stride, int w, int h,
+                              uint32_t *sse, int *sum) {
+  uint64_t sse64 = 0;
+  int64_t sum64 = 0;
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);
+  *sse = (uint32_t)sse64;
+  *sum = (int)sum64;
+}
+
+// As highbd_8_variance, but sse is reduced with ROUND_POWER_OF_TWO(.., 4) and
+// sum with ROUND_POWER_OF_TWO(.., 2) before narrowing.
+static void highbd_10_variance(const uint8_t *a8, int a_stride,
+                               const uint8_t *b8, int b_stride, int w, int h,
+                               uint32_t *sse, int *sum) {
+  uint64_t sse64 = 0;
+  int64_t sum64 = 0;
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4);
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+}
+
+// As highbd_8_variance, but sse is reduced with ROUND_POWER_OF_TWO(.., 8) and
+// sum with ROUND_POWER_OF_TWO(.., 4) before narrowing.
+static void highbd_12_variance(const uint8_t *a8, int a_stride,
+                               const uint8_t *b8, int b_stride, int w, int h,
+                               uint32_t *sse, int *sum) {
+  uint64_t sse64 = 0;
+  int64_t sum64 = 0;
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8);
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+}
+
+/* Defines the 8/10/12 variance functions for a W x H block. Each stores the
+ * SSE in *sse and returns sse - sum^2 / (W * H); the 10 and 12 variants
+ * compute the difference in 64 bits and return 0 when it is negative.
+ */
+#define HIGHBD_VAR(W, H) \
+  uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                              const uint8_t *b, int b_stride, \
+                                              uint32_t *sse) { \
+    int diff_sum; \
+    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum); \
+    return *sse - (uint32_t)(((int64_t)diff_sum * diff_sum) / (W * H)); \
+  } \
+  \
+  uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                               const uint8_t *b, int b_stride, \
+                                               uint32_t *sse) { \
+    int diff_sum; \
+    int64_t var; \
+    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum); \
+    var = (int64_t)(*sse) - (((int64_t)diff_sum * diff_sum) / (W * H)); \
+    return (var >= 0) ? (uint32_t)var : 0; \
+  } \
+  \
+  uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                               const uint8_t *b, int b_stride, \
+                                               uint32_t *sse) { \
+    int diff_sum; \
+    int64_t var; \
+    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum); \
+    var = (int64_t)(*sse) - (((int64_t)diff_sum * diff_sum) / (W * H)); \
+    return (var >= 0) ? (uint32_t)var : 0; \
+  }
+
+/* Defines the 8/10/12 get<S>x<S>var functions: thin wrappers that hand both
+ * sse and sum back through the caller's pointers for an S x S block.
+ */
+#define HIGHBD_GET_VAR(S) \
+  void aom_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                        const uint8_t *ref, int ref_stride, \
+                                        uint32_t *sse, int *sum) { \
+    highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+  } \
+  \
+  void aom_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                         const uint8_t *ref, int ref_stride, \
+                                         uint32_t *sse, int *sum) { \
+    highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+  } \
+  \
+  void aom_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                         const uint8_t *ref, int ref_stride, \
+                                         uint32_t *sse, int *sum) { \
+    highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+  }
+
+/* Defines the 8/10/12 mse functions for a W x H block: each stores the SSE in
+ * *sse and also returns it; the difference sum is discarded.
+ */
+#define HIGHBD_MSE(W, H) \
+  uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+                                         const uint8_t *ref, int ref_stride, \
+                                         uint32_t *sse) { \
+    int unused_sum; \
+    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, \
+                      &unused_sum); \
+    return *sse; \
+  } \
+  \
+  uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+                                          const uint8_t *ref, int ref_stride, \
+                                          uint32_t *sse) { \
+    int unused_sum; \
+    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, \
+                       &unused_sum); \
+    return *sse; \
+  } \
+  \
+  uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+                                          const uint8_t *ref, int ref_stride, \
+                                          uint32_t *sse) { \
+    int unused_sum; \
+    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, \
+                       &unused_sum); \
+    return *sse; \
+  }
+
+// High-bitdepth first pass of the 2-tap bilinear filter. src_ptr8 is passed
+// through CONVERT_TO_SHORTPTR and read as 16-bit samples; pixel_step is the
+// offset between the two filter inputs.
+void aom_highbd_var_filter_block2d_bil_first_pass(
+    const uint8_t *src_ptr8, uint16_t *output_ptr,
+    unsigned int src_pixels_per_line, int pixel_step,
+    unsigned int output_height, unsigned int output_width,
+    const uint8_t *filter) {
+  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+  unsigned int row, col;
+
+  for (row = 0; row < output_height; ++row) {
+    for (col = 0; col < output_width; ++col, ++src_ptr) {
+      const int taps = (int)src_ptr[0] * filter[0] +
+                       (int)src_ptr[pixel_step] * filter[1];
+      output_ptr[col] = ROUND_POWER_OF_TWO(taps, FILTER_BITS);
+    }
+
+    // Next row: src_ptr has already advanced output_width samples.
+    src_ptr += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
+
+// High-bitdepth second pass of the 2-tap bilinear filter: 16-bit input and
+// 16-bit output; pixel_step is the offset between the two filter inputs.
+void aom_highbd_var_filter_block2d_bil_second_pass(
+    const uint16_t *src_ptr, uint16_t *output_ptr,
+    unsigned int src_pixels_per_line, unsigned int pixel_step,
+    unsigned int output_height, unsigned int output_width,
+    const uint8_t *filter) {
+  unsigned int row, col;
+
+  for (row = 0; row < output_height; ++row) {
+    for (col = 0; col < output_width; ++col, ++src_ptr) {
+      const int taps = (int)src_ptr[0] * filter[0] +
+                       (int)src_ptr[pixel_step] * filter[1];
+      output_ptr[col] = ROUND_POWER_OF_TWO(taps, FILTER_BITS);
+    }
+
+    // Next row: src_ptr has already advanced output_width samples.
+    src_ptr += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
+
+/* Defines the 8/10/12 sub_pixel_variance functions for a W x H block: src is
+ * run through both bilinear passes (filters chosen by xoffset and yoffset)
+ * and the filtered block is handed to the matching variance function.
+ */
+#define HIGHBD_SUBPIX_VAR(W, H) \
+  uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c( \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+      const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+    uint16_t first_pass[(H + 1) * W]; \
+    uint16_t filtered[H * W]; \
+    \
+    aom_highbd_var_filter_block2d_bil_first_pass( \
+        src, first_pass, src_stride, 1, H + 1, W, \
+        bilinear_filters_2t[xoffset]); \
+    aom_highbd_var_filter_block2d_bil_second_pass( \
+        first_pass, filtered, W, W, H, W, bilinear_filters_2t[yoffset]); \
+    \
+    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(filtered), \
+                                              W, dst, dst_stride, sse); \
+  } \
+  \
+  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c( \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+      const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+    uint16_t first_pass[(H + 1) * W]; \
+    uint16_t filtered[H * W]; \
+    \
+    aom_highbd_var_filter_block2d_bil_first_pass( \
+        src, first_pass, src_stride, 1, H + 1, W, \
+        bilinear_filters_2t[xoffset]); \
+    aom_highbd_var_filter_block2d_bil_second_pass( \
+        first_pass, filtered, W, W, H, W, bilinear_filters_2t[yoffset]); \
+    \
+    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(filtered), \
+                                               W, dst, dst_stride, sse); \
+  } \
+  \
+  uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c( \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+      const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+    uint16_t first_pass[(H + 1) * W]; \
+    uint16_t filtered[H * W]; \
+    \
+    aom_highbd_var_filter_block2d_bil_first_pass( \
+        src, first_pass, src_stride, 1, H + 1, W, \
+        bilinear_filters_2t[xoffset]); \
+    aom_highbd_var_filter_block2d_bil_second_pass( \
+        first_pass, filtered, W, W, H, W, bilinear_filters_2t[yoffset]); \
+    \
+    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(filtered), \
+                                               W, dst, dst_stride, sse); \
+  }
+
+/* Defines the 8/10/12 sub_pixel_avg_variance functions for a W x H block:
+ * as HIGHBD_SUBPIX_VAR, but the filtered block is combined with second_pred
+ * by aom_highbd_comp_avg_pred_c() before the variance is taken.
+ */
+#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
+  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+      const uint8_t *dst, int dst_stride, uint32_t *sse, \
+      const uint8_t *second_pred) { \
+    uint16_t first_pass[(H + 1) * W]; \
+    uint16_t filtered[H * W]; \
+    DECLARE_ALIGNED(16, uint16_t, averaged[H * W]); \
+    \
+    aom_highbd_var_filter_block2d_bil_first_pass( \
+        src, first_pass, src_stride, 1, H + 1, W, \
+        bilinear_filters_2t[xoffset]); \
+    aom_highbd_var_filter_block2d_bil_second_pass( \
+        first_pass, filtered, W, W, H, W, bilinear_filters_2t[yoffset]); \
+    \
+    aom_highbd_comp_avg_pred_c(averaged, second_pred, W, H, \
+                               CONVERT_TO_BYTEPTR(filtered), W); \
+    \
+    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(averaged), \
+                                              W, dst, dst_stride, sse); \
+  } \
+  \
+  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+      const uint8_t *dst, int dst_stride, uint32_t *sse, \
+      const uint8_t *second_pred) { \
+    uint16_t first_pass[(H + 1) * W]; \
+    uint16_t filtered[H * W]; \
+    DECLARE_ALIGNED(16, uint16_t, averaged[H * W]); \
+    \
+    aom_highbd_var_filter_block2d_bil_first_pass( \
+        src, first_pass, src_stride, 1, H + 1, W, \
+        bilinear_filters_2t[xoffset]); \
+    aom_highbd_var_filter_block2d_bil_second_pass( \
+        first_pass, filtered, W, W, H, W, bilinear_filters_2t[yoffset]); \
+    \
+    aom_highbd_comp_avg_pred_c(averaged, second_pred, W, H, \
+                               CONVERT_TO_BYTEPTR(filtered), W); \
+    \
+    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(averaged), \
+                                               W, dst, dst_stride, sse); \
+  } \
+  \
+  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+      const uint8_t *dst, int dst_stride, uint32_t *sse, \
+      const uint8_t *second_pred) { \
+    uint16_t first_pass[(H + 1) * W]; \
+    uint16_t filtered[H * W]; \
+    DECLARE_ALIGNED(16, uint16_t, averaged[H * W]); \
+    \
+    aom_highbd_var_filter_block2d_bil_first_pass( \
+        src, first_pass, src_stride, 1, H + 1, W, \
+        bilinear_filters_2t[xoffset]); \
+    aom_highbd_var_filter_block2d_bil_second_pass( \
+        first_pass, filtered, W, W, H, W, bilinear_filters_2t[yoffset]); \
+    \
+    aom_highbd_comp_avg_pred_c(averaged, second_pred, W, H, \
+                               CONVERT_TO_BYTEPTR(filtered), W); \
+    \
+    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(averaged), \
+                                               W, dst, dst_stride, sse); \
+  }
+
+/* All three forms of the variance are available in the same sizes. */
+#define HIGHBD_VARIANCES(W, H) \
+  HIGHBD_VAR(W, H) \
+  HIGHBD_SUBPIX_VAR(W, H) \
+  HIGHBD_SUBPIX_AVG_VAR(W, H)
+
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+HIGHBD_VARIANCES(128, 128)
+HIGHBD_VARIANCES(128, 64)
+HIGHBD_VARIANCES(64, 128)
+#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
+HIGHBD_VARIANCES(64, 64)
+HIGHBD_VARIANCES(64, 32)
+HIGHBD_VARIANCES(32, 64)
+HIGHBD_VARIANCES(32, 32)
+HIGHBD_VARIANCES(32, 16)
+HIGHBD_VARIANCES(16, 32)
+HIGHBD_VARIANCES(16, 16)
+HIGHBD_VARIANCES(16, 8)
+HIGHBD_VARIANCES(8, 16)
+HIGHBD_VARIANCES(8, 8)
+HIGHBD_VARIANCES(8, 4)
+HIGHBD_VARIANCES(4, 8)
+HIGHBD_VARIANCES(4, 4)
+HIGHBD_VARIANCES(4, 2)
+HIGHBD_VARIANCES(2, 4)
+HIGHBD_VARIANCES(2, 2)
+
+HIGHBD_GET_VAR(8)
+HIGHBD_GET_VAR(16)
+
+HIGHBD_MSE(16, 16)
+HIGHBD_MSE(16, 8)
+HIGHBD_MSE(8, 16)
+HIGHBD_MSE(8, 8)
+
+// High-bitdepth counterpart of aom_comp_avg_pred_c: pred8 and ref8 are passed
+// through CONVERT_TO_SHORTPTR and averaged with ROUND_POWER_OF_TWO(sum, 1).
+// comp_pred and pred use a row pitch of width; ref rows are ref_stride apart.
+void aom_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
+                                int width, int height, const uint8_t *ref8,
+                                int ref_stride) {
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  int row, col;
+
+  for (row = 0; row < height; ++row) {
+    for (col = 0; col < width; ++col) {
+      const int pair_sum = pred[col] + ref[col];
+      comp_pred[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
+
+// Copies every 8th sample of the 16-bit reference (both directions) into
+// comp_pred, which is packed with a row pitch of width.
+void aom_highbd_upsampled_pred_c(uint16_t *comp_pred, int width, int height,
+                                 const uint8_t *ref8, int ref_stride) {
+  const int ref_row_step = ref_stride << 3;
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  int row, col;
+
+  for (row = 0; row < height; ++row) {
+    for (col = 0; col < width; ++col) {
+      comp_pred[col] = ref[(col << 3)];
+    }
+    comp_pred += width;
+    ref += ref_row_step;
+  }
+}
+
+// Averages pred with every 8th sample of the 16-bit reference (both
+// directions) using ROUND_POWER_OF_TWO(sum, 1).
+void aom_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred,
+                                          const uint8_t *pred8, int width,
+                                          int height, const uint8_t *ref8,
+                                          int ref_stride) {
+  const int ref_row_step = ref_stride << 3;
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  int row, col;
+
+  for (row = 0; row < height; ++row) {
+    for (col = 0; col < width; ++col) {
+      const int pair_sum = pred[col] + ref[(col << 3)];
+      comp_pred[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_row_step;
+  }
+}
+#endif  // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_AV1 && CONFIG_EXT_INTER
+// Mask-weighted variance terms over a w x h block: each difference (a - b) is
+// multiplied by the mask sample m before being accumulated. The magnitude of
+// the accumulated sum is reduced with ROUND_POWER_OF_TWO(.., 6) into *sum and
+// the accumulated squares with ROUND_POWER_OF_TWO(.., 12) into *sse.
+void masked_variance(const uint8_t *a, int a_stride, const uint8_t *b,
+                     int b_stride, const uint8_t *m, int m_stride, int w, int h,
+                     unsigned int *sse, int *sum) {
+  int64_t diff_acc = 0;
+  uint64_t sq_acc = 0;
+  int row, col;
+
+  for (row = 0; row < h; ++row) {
+    for (col = 0; col < w; ++col) {
+      const int weighted = (a[col] - b[col]) * (m[col]);
+      diff_acc += weighted;
+      sq_acc += weighted * weighted;
+    }
+
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+
+  // Only the magnitude of the sum is kept.
+  if (diff_acc < 0) diff_acc = -diff_acc;
+  *sum = (int)ROUND_POWER_OF_TWO(diff_acc, 6);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sq_acc, 12);
+}
+
+/* Defines aom_masked_variance<W>x<H>_c(): stores the masked SSE in *sse and
+ * returns sse - sum^2 / (W * H).
+ */
+#define MASK_VAR(W, H) \
+  unsigned int aom_masked_variance##W##x##H##_c( \
+      const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
+      const uint8_t *m, int m_stride, unsigned int *sse) { \
+    int diff_sum; \
+    masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, \
+                    &diff_sum); \
+    return *sse - (unsigned int)(((int64_t)diff_sum * diff_sum) / (W * H)); \
+  }
+
+/* Defines aom_masked_sub_pixel_variance<W>x<H>_c(): src is run through both
+ * bilinear passes (filters chosen by xoffset and yoffset) and the filtered
+ * block is handed to the matching masked variance function.
+ */
+#define MASK_SUBPIX_VAR(W, H) \
+  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+      const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
+      unsigned int *sse) { \
+    uint16_t first_pass[(H + 1) * W]; \
+    uint8_t filtered[H * W]; \
+    \
+    var_filter_block2d_bil_first_pass(src, first_pass, src_stride, 1, H + 1, \
+                                      W, bilinear_filters_2t[xoffset]); \
+    var_filter_block2d_bil_second_pass(first_pass, filtered, W, W, H, W, \
+                                       bilinear_filters_2t[yoffset]); \
+    \
+    return aom_masked_variance##W##x##H##_c(filtered, W, dst, dst_stride, \
+                                            msk, msk_stride, sse); \
+  }
+
+MASK_VAR(4, 4)
+MASK_SUBPIX_VAR(4, 4)
+
+MASK_VAR(4, 8)
+MASK_SUBPIX_VAR(4, 8)
+
+MASK_VAR(8, 4)
+MASK_SUBPIX_VAR(8, 4)
+
+MASK_VAR(8, 8)
+MASK_SUBPIX_VAR(8, 8)
+
+MASK_VAR(8, 16)
+MASK_SUBPIX_VAR(8, 16)
+
+MASK_VAR(16, 8) +MASK_SUBPIX_VAR(16, 8) + +MASK_VAR(16, 16) +MASK_SUBPIX_VAR(16, 16) + +MASK_VAR(16, 32) +MASK_SUBPIX_VAR(16, 32) + +MASK_VAR(32, 16) +MASK_SUBPIX_VAR(32, 16) + +MASK_VAR(32, 32) +MASK_SUBPIX_VAR(32, 32) + +MASK_VAR(32, 64) +MASK_SUBPIX_VAR(32, 64) + +MASK_VAR(64, 32) +MASK_SUBPIX_VAR(64, 32) + +MASK_VAR(64, 64) +MASK_SUBPIX_VAR(64, 64) + +#if CONFIG_EXT_PARTITION +MASK_VAR(64, 128) +MASK_SUBPIX_VAR(64, 128) + +MASK_VAR(128, 64) +MASK_SUBPIX_VAR(128, 64) + +MASK_VAR(128, 128) +MASK_SUBPIX_VAR(128, 128) +#endif // CONFIG_EXT_PARTITION + +#if CONFIG_HIGHBITDEPTH +void highbd_masked_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m, + int m_stride, int w, int h, uint64_t *sse, + int64_t *sum) { + int i, j; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = (a[j] - b[j]) * (m[j]); + *sum += (int64_t)diff; + *sse += (int64_t)diff * diff; + } + + a += a_stride; + b += b_stride; + m += m_stride; + } + *sum = (*sum >= 0) ? 
*sum : -*sum; + *sum = ROUND_POWER_OF_TWO(*sum, 6); + *sse = ROUND_POWER_OF_TWO(*sse, 12); +} + +void highbd_masked_variance(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, const uint8_t *m, int m_stride, int w, + int h, unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h, + &sse64, &sum64); + *sum = (int)sum64; + *sse = (unsigned int)sse64; +} + +void highbd_10_masked_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m, int m_stride, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h, + &sse64, &sum64); + *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); +} + +void highbd_12_masked_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m, int m_stride, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h, + &sse64, &sum64); + *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); +} + +#define HIGHBD_MASK_VAR(W, H) \ + unsigned int aom_highbd_masked_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + int sum; \ + highbd_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, \ + &sum); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int aom_highbd_10_masked_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, \ + sse, &sum); \ + var = 
(int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + unsigned int aom_highbd_12_masked_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, \ + sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HIGHBD_MASK_SUBPIX_VAR(W, H) \ + unsigned int aom_highbd_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_masked_variance##W##x##H##_c( \ + CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \ + } \ + \ + unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_10_masked_variance##W##x##H##_c( \ + CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \ + } \ + \ + unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \ + 
const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_12_masked_variance##W##x##H##_c( \ + CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \ + } + +HIGHBD_MASK_VAR(4, 4) +HIGHBD_MASK_SUBPIX_VAR(4, 4) + +HIGHBD_MASK_VAR(4, 8) +HIGHBD_MASK_SUBPIX_VAR(4, 8) + +HIGHBD_MASK_VAR(8, 4) +HIGHBD_MASK_SUBPIX_VAR(8, 4) + +HIGHBD_MASK_VAR(8, 8) +HIGHBD_MASK_SUBPIX_VAR(8, 8) + +HIGHBD_MASK_VAR(8, 16) +HIGHBD_MASK_SUBPIX_VAR(8, 16) + +HIGHBD_MASK_VAR(16, 8) +HIGHBD_MASK_SUBPIX_VAR(16, 8) + +HIGHBD_MASK_VAR(16, 16) +HIGHBD_MASK_SUBPIX_VAR(16, 16) + +HIGHBD_MASK_VAR(16, 32) +HIGHBD_MASK_SUBPIX_VAR(16, 32) + +HIGHBD_MASK_VAR(32, 16) +HIGHBD_MASK_SUBPIX_VAR(32, 16) + +HIGHBD_MASK_VAR(32, 32) +HIGHBD_MASK_SUBPIX_VAR(32, 32) + +HIGHBD_MASK_VAR(32, 64) +HIGHBD_MASK_SUBPIX_VAR(32, 64) + +HIGHBD_MASK_VAR(64, 32) +HIGHBD_MASK_SUBPIX_VAR(64, 32) + +HIGHBD_MASK_VAR(64, 64) +HIGHBD_MASK_SUBPIX_VAR(64, 64) + +#if CONFIG_EXT_PARTITION +HIGHBD_MASK_VAR(64, 128) +HIGHBD_MASK_SUBPIX_VAR(64, 128) + +HIGHBD_MASK_VAR(128, 64) +HIGHBD_MASK_SUBPIX_VAR(128, 64) + +HIGHBD_MASK_VAR(128, 128) +HIGHBD_MASK_SUBPIX_VAR(128, 128) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_AV1 && CONFIG_EXT_INTER + +#if CONFIG_AV1 && CONFIG_MOTION_VAR +static INLINE void obmc_variance(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + int diff = 
ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12); + *sum += diff; + *sse += diff * diff; + } + + pre += pre_stride; + wsrc += w; + mask += w; + } +} + +#define OBMC_VAR(W, H) \ + unsigned int aom_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } + +#define OBMC_SUBPIX_VAR(W, H) \ + unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \ + } + +OBMC_VAR(4, 4) +OBMC_SUBPIX_VAR(4, 4) + +OBMC_VAR(4, 8) +OBMC_SUBPIX_VAR(4, 8) + +OBMC_VAR(8, 4) +OBMC_SUBPIX_VAR(8, 4) + +OBMC_VAR(8, 8) +OBMC_SUBPIX_VAR(8, 8) + +OBMC_VAR(8, 16) +OBMC_SUBPIX_VAR(8, 16) + +OBMC_VAR(16, 8) +OBMC_SUBPIX_VAR(16, 8) + +OBMC_VAR(16, 16) +OBMC_SUBPIX_VAR(16, 16) + +OBMC_VAR(16, 32) +OBMC_SUBPIX_VAR(16, 32) + +OBMC_VAR(32, 16) +OBMC_SUBPIX_VAR(32, 16) + +OBMC_VAR(32, 32) +OBMC_SUBPIX_VAR(32, 32) + +OBMC_VAR(32, 64) +OBMC_SUBPIX_VAR(32, 64) + +OBMC_VAR(64, 32) +OBMC_SUBPIX_VAR(64, 32) + +OBMC_VAR(64, 64) +OBMC_SUBPIX_VAR(64, 64) + +#if CONFIG_EXT_PARTITION +OBMC_VAR(64, 128) +OBMC_SUBPIX_VAR(64, 128) + +OBMC_VAR(128, 64) +OBMC_SUBPIX_VAR(128, 64) + +OBMC_VAR(128, 128) +OBMC_SUBPIX_VAR(128, 128) +#endif // CONFIG_EXT_PARTITION + +#if CONFIG_HIGHBITDEPTH +static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, 
int h, + uint64_t *sse, int64_t *sum) { + int i, j; + uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12); + *sum += diff; + *sse += diff * diff; + } + + pre += pre_stride; + wsrc += w; + mask += w; + } +} + +static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); + *sum = (int)sum64; + *sse = (unsigned int)sse64; +} + +static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); + *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); +} + +static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); + *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); +} + +#define HIGHBD_OBMC_VAR(W, H) \ + unsigned int aom_highbd_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int aom_highbd_10_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + 
int sum; \ + int64_t var; \ + highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + unsigned int aom_highbd_12_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HIGHBD_OBMC_SUBPIX_VAR(W, H) \ + unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + wsrc, mask, sse); \ + } \ + \ + unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + W, wsrc, mask, sse); \ + } \ + \ + unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int 
yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + W, wsrc, mask, sse); \ + } + +HIGHBD_OBMC_VAR(4, 4) +HIGHBD_OBMC_SUBPIX_VAR(4, 4) + +HIGHBD_OBMC_VAR(4, 8) +HIGHBD_OBMC_SUBPIX_VAR(4, 8) + +HIGHBD_OBMC_VAR(8, 4) +HIGHBD_OBMC_SUBPIX_VAR(8, 4) + +HIGHBD_OBMC_VAR(8, 8) +HIGHBD_OBMC_SUBPIX_VAR(8, 8) + +HIGHBD_OBMC_VAR(8, 16) +HIGHBD_OBMC_SUBPIX_VAR(8, 16) + +HIGHBD_OBMC_VAR(16, 8) +HIGHBD_OBMC_SUBPIX_VAR(16, 8) + +HIGHBD_OBMC_VAR(16, 16) +HIGHBD_OBMC_SUBPIX_VAR(16, 16) + +HIGHBD_OBMC_VAR(16, 32) +HIGHBD_OBMC_SUBPIX_VAR(16, 32) + +HIGHBD_OBMC_VAR(32, 16) +HIGHBD_OBMC_SUBPIX_VAR(32, 16) + +HIGHBD_OBMC_VAR(32, 32) +HIGHBD_OBMC_SUBPIX_VAR(32, 32) + +HIGHBD_OBMC_VAR(32, 64) +HIGHBD_OBMC_SUBPIX_VAR(32, 64) + +HIGHBD_OBMC_VAR(64, 32) +HIGHBD_OBMC_SUBPIX_VAR(64, 32) + +HIGHBD_OBMC_VAR(64, 64) +HIGHBD_OBMC_SUBPIX_VAR(64, 64) + +#if CONFIG_EXT_PARTITION +HIGHBD_OBMC_VAR(64, 128) +HIGHBD_OBMC_SUBPIX_VAR(64, 128) + +HIGHBD_OBMC_VAR(128, 64) +HIGHBD_OBMC_SUBPIX_VAR(128, 64) + +HIGHBD_OBMC_VAR(128, 128) +HIGHBD_OBMC_SUBPIX_VAR(128, 128) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_AV1 && CONFIG_MOTION_VAR diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h new file mode 100644 index 000000000..7c925cfac --- /dev/null +++ b/third_party/aom/aom_dsp/variance.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_VARIANCE_H_ +#define AOM_DSP_VARIANCE_H_ + +#include "./aom_config.h" + +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define FILTER_BITS 7 +#define FILTER_WEIGHT 128 + +typedef unsigned int (*aom_sad_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride); + +typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *second_pred); + +typedef void (*aom_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b, + int b_stride, int n); + +typedef void (*aom_sad_multi_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sad_array); + +typedef void (*aom_sad_multi_d_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *const b_array[], + int b_stride, unsigned int *sad_array); + +typedef unsigned int (*aom_variance_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse); + +typedef unsigned int (*aom_subpixvariance_fn_t)(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + const uint8_t *b, int b_stride, + unsigned int *sse); + +typedef unsigned int (*aom_subp_avg_variance_fn_t)( + const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, + int b_stride, unsigned int *sse, const uint8_t *second_pred); + +#if CONFIG_AV1 && CONFIG_EXT_INTER +typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *msk_ptr, + int msk_stride); +typedef unsigned int (*aom_masked_variance_fn_t)( + const uint8_t *src, int src_stride, const uint8_t *ref, int 
ref_stride, + const uint8_t *msk, int msk_stride, unsigned int *sse); +typedef unsigned int (*aom_masked_subpixvariance_fn_t)( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *ref, int ref_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse); +#endif // CONFIG_AV1 && CONFIG_EXT_INTER + +#if CONFIG_AV1 && CONFIG_MOTION_VAR +typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride, + const int32_t *wsrc, + const int32_t *msk); +typedef unsigned int (*aom_obmc_variance_fn_t)(const uint8_t *pred, + int pred_stride, + const int32_t *wsrc, + const int32_t *msk, + unsigned int *sse); +typedef unsigned int (*aom_obmc_subpixvariance_fn_t)( + const uint8_t *pred, int pred_stride, int xoffset, int yoffset, + const int32_t *wsrc, const int32_t *msk, unsigned int *sse); +#endif // CONFIG_AV1 && CONFIG_MOTION_VAR + +#if CONFIG_AV1 +typedef struct aom_variance_vtable { + aom_sad_fn_t sdf; + aom_sad_avg_fn_t sdaf; + aom_variance_fn_t vf; + aom_subpixvariance_fn_t svf; + aom_subp_avg_variance_fn_t svaf; + aom_sad_multi_fn_t sdx3f; + aom_sad_multi_fn_t sdx8f; + aom_sad_multi_d_fn_t sdx4df; +#if CONFIG_EXT_INTER + aom_masked_sad_fn_t msdf; + aom_masked_variance_fn_t mvf; + aom_masked_subpixvariance_fn_t msvf; +#endif // CONFIG_EXT_INTER +#if CONFIG_MOTION_VAR + aom_obmc_sad_fn_t osdf; + aom_obmc_variance_fn_t ovf; + aom_obmc_subpixvariance_fn_t osvf; +#endif // CONFIG_MOTION_VAR +} aom_variance_fn_ptr_t; +#endif // CONFIG_AV1 + +void aom_highbd_var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr8, uint16_t *output_ptr, + unsigned int src_pixels_per_line, int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter); + +void aom_highbd_var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter); + +uint32_t 
aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h); + +#if CONFIG_HIGHBITDEPTH +uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, int w, int h); +#endif // CONFIG_HIGHBITDEPTH + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_VARIANCE_H_ diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c new file mode 100644 index 000000000..4067b0b53 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/x86/convolve.h" + +#if HAVE_SSE2 +filter8_1dfunction aom_filter_block1d16_v8_sse2; +filter8_1dfunction aom_filter_block1d16_h8_sse2; +filter8_1dfunction aom_filter_block1d8_v8_sse2; +filter8_1dfunction aom_filter_block1d8_h8_sse2; +filter8_1dfunction aom_filter_block1d4_v8_sse2; +filter8_1dfunction aom_filter_block1d4_h8_sse2; +filter8_1dfunction aom_filter_block1d16_v8_avg_sse2; +filter8_1dfunction aom_filter_block1d16_h8_avg_sse2; +filter8_1dfunction aom_filter_block1d8_v8_avg_sse2; +filter8_1dfunction aom_filter_block1d8_h8_avg_sse2; +filter8_1dfunction aom_filter_block1d4_v8_avg_sse2; +filter8_1dfunction aom_filter_block1d4_h8_avg_sse2; + +filter8_1dfunction aom_filter_block1d16_v2_sse2; +filter8_1dfunction aom_filter_block1d16_h2_sse2; +filter8_1dfunction aom_filter_block1d8_v2_sse2; +filter8_1dfunction aom_filter_block1d8_h2_sse2; +filter8_1dfunction aom_filter_block1d4_v2_sse2; +filter8_1dfunction aom_filter_block1d4_h2_sse2; +filter8_1dfunction aom_filter_block1d16_v2_avg_sse2; +filter8_1dfunction aom_filter_block1d16_h2_avg_sse2; +filter8_1dfunction aom_filter_block1d8_v2_avg_sse2; +filter8_1dfunction aom_filter_block1d8_h2_avg_sse2; +filter8_1dfunction aom_filter_block1d4_v2_avg_sse2; +filter8_1dfunction aom_filter_block1d4_h2_avg_sse2; + +// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int 
y_step_q4, +// int w, int h); +// void aom_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); + +// void aom_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, sse2); +FUN_CONV_2D(avg_, sse2); + +#if CONFIG_HIGHBITDEPTH && ARCH_X86_64 +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_avg_sse2; + +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2; 
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_avg_sse2; + +// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void aom_highbd_convolve8_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void aom_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void aom_highbd_convolve8_avg_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); +HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); +HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, + sse2); + +// void 
aom_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h, int bd); +// void aom_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_2D(, sse2); +HIGH_FUN_CONV_2D(avg_, sse2); + +#if CONFIG_LOOP_RESTORATION +// The SSE2 highbd convolve functions can deal with coefficients up to 32767. +// So redirect highbd_convolve8_add_src to regular highbd_convolve8. +// +// The redirect temporarily adds 128 to tap 3 of each kernel, calls +// aom_highbd_convolve8_sse2(), then subtracts the 128 again. Both step +// arguments are asserted to be 16. +// NOTE(review): 128 is presumably unity gain at 7-bit filter precision, which +// is what turns the plain convolve into an add-source filter -- confirm. +// +// The kernels are modified through a cast that drops const, so filter_x and +// filter_y must point to writable storage that no other thread reads while +// this call is in progress. +// +// If filter_x and filter_y are the same array the bias is applied once. +// Applying it twice would raise the shared tap by 256 for both passes. +void aom_highbd_convolve8_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + int16_t *const kernel_x = (int16_t *)filter_x; + int16_t *const kernel_y = (int16_t *)filter_y; + // A kernel shared by both directions must be biased only once. + const int shared_kernel = (kernel_x == kernel_y); + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + kernel_x[3] += 128; + if (!shared_kernel) kernel_y[3] += 128; + aom_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h, bd); + // Undo the bias so the caller's kernels are left exactly as passed in. + kernel_x[3] -= 128; + if (!shared_kernel) kernel_y[3] -= 128; +} +#endif // CONFIG_LOOP_RESTORATION +#endif // CONFIG_HIGHBITDEPTH && ARCH_X86_64 +#endif // HAVE_SSE2 diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm new file mode 100644 index 000000000..4d3142867 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm @@ -0,0 +1,345 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software.
If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro convolve_fn 1-2 +%ifidn %1, avg +%define AUX_XMM_REGS 4 +%else +%define AUX_XMM_REGS 0 +%endif +%ifidn %2, highbd +%define pavg pavgw +cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ + dst, dst_stride, \ + fx, fxs, fy, fys, w, h, bd +%else +%define pavg pavgb +cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ + dst, dst_stride, \ + fx, fxs, fy, fys, w, h +%endif + mov r4d, dword wm +%ifidn %2, highbd + shl r4d, 1 + shl srcq, 1 + shl src_strideq, 1 + shl dstq, 1 + shl dst_strideq, 1 +%else + cmp r4d, 4 + je .w4 +%endif + cmp r4d, 8 + je .w8 + cmp r4d, 16 + je .w16 + cmp r4d, 32 + je .w32 + +%if CONFIG_AV1 && CONFIG_EXT_PARTITION + cmp r4d, 64 + je .w64 +%ifidn %2, highbd + cmp r4d, 128 + je .w128 + +.w256: + mov r4d, dword hm +.loop256: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + movu m0, [srcq+64] + movu m1, [srcq+80] + movu m2, [srcq+96] + movu m3, [srcq+112] +%ifidn %1, avg + pavg m0, [dstq+64] + pavg m1, [dstq+80] + pavg m2, [dstq+96] + pavg m3, [dstq+112] +%endif + mova [dstq+64], m0 + mova [dstq+80], m1 + mova [dstq+96], m2 + mova [dstq+112], m3 + movu m0, [srcq+128] + movu m1, [srcq+128+16] + movu m2, [srcq+128+32] + movu m3, [srcq+128+48] +%ifidn %1, avg + pavg m0, [dstq+128] + pavg m1, [dstq+128+16] + pavg m2, [dstq+128+32] + pavg m3, [dstq+128+48] +%endif + mova [dstq+128 ], m0 + mova [dstq+128+16], m1 + mova [dstq+128+32], m2 + mova [dstq+128+48], m3 + movu m0, [srcq+128+64] + movu m1, [srcq+128+80] + movu m2, [srcq+128+96] + movu m3, [srcq+128+112] + add 
srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq+128+64] + pavg m1, [dstq+128+80] + pavg m2, [dstq+128+96] + pavg m3, [dstq+128+112] +%endif + mova [dstq+128+64], m0 + mova [dstq+128+80], m1 + mova [dstq+128+96], m2 + mova [dstq+128+112], m3 + add dstq, dst_strideq + sub r4d, 1 + jnz .loop256 + RET +%endif + +.w128: + mov r4d, dword hm +.loop128: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + movu m0, [srcq+64] + movu m1, [srcq+80] + movu m2, [srcq+96] + movu m3, [srcq+112] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq+64] + pavg m1, [dstq+80] + pavg m2, [dstq+96] + pavg m3, [dstq+112] +%endif + mova [dstq+64], m0 + mova [dstq+80], m1 + mova [dstq+96], m2 + mova [dstq+112], m3 + add dstq, dst_strideq + sub r4d, 1 + jnz .loop128 + RET + +%else ; CONFIG_AV1 && CONFIG_EXT_PARTITION + +%ifidn %2, highbd + cmp r4d, 64 + je .w64 + + mov r4d, dword hm +.loop128: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + movu m0, [srcq+64] + movu m1, [srcq+80] + movu m2, [srcq+96] + movu m3, [srcq+112] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq+64] + pavg m1, [dstq+80] + pavg m2, [dstq+96] + pavg m3, [dstq+112] +%endif + mova [dstq+64], m0 + mova [dstq+80], m1 + mova [dstq+96], m2 + mova [dstq+112], m3 + add dstq, dst_strideq + sub r4d, 1 + jnz .loop128 + RET +%endif +%endif ; CONFIG_AV1 && CONFIG_EXT_PARTITION + +.w64: + mov r4d, dword hm +.loop64: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, 
[dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + add dstq, dst_strideq + sub r4d, 1 + jnz .loop64 + RET + +.w32: + mov r4d, dword hm +.loop32: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+src_strideq] + movu m3, [srcq+src_strideq+16] + lea srcq, [srcq+src_strideq*2] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq +16] + pavg m2, [dstq+dst_strideq] + pavg m3, [dstq+dst_strideq+16] +%endif + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+dst_strideq ], m2 + mova [dstq+dst_strideq+16], m3 + lea dstq, [dstq+dst_strideq*2] + sub r4d, 2 + jnz .loop32 + RET + +.w16: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop16: + movu m0, [srcq] + movu m1, [srcq+src_strideq] + movu m2, [srcq+src_strideq*2] + movu m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+dst_strideq] + pavg m2, [dstq+dst_strideq*2] + pavg m3, [dstq+r6q] +%endif + mova [dstq ], m0 + mova [dstq+dst_strideq ], m1 + mova [dstq+dst_strideq*2], m2 + mova [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop16 + RET + +.w8: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop8: + movh m0, [srcq] + movh m1, [srcq+src_strideq] + movh m2, [srcq+src_strideq*2] + movh m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + movh m4, [dstq] + movh m5, [dstq+dst_strideq] + movh m6, [dstq+dst_strideq*2] + movh m7, [dstq+r6q] + pavg m0, m4 + pavg m1, m5 + pavg m2, m6 + pavg m3, m7 +%endif + movh [dstq ], m0 + movh [dstq+dst_strideq ], m1 + movh [dstq+dst_strideq*2], m2 + movh [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop8 + RET + +%ifnidn %2, highbd +.w4: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop4: + movd m0, [srcq] + movd m1, [srcq+src_strideq] + movd m2, [srcq+src_strideq*2] + movd m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] 
+%ifidn %1, avg + movd m4, [dstq] + movd m5, [dstq+dst_strideq] + movd m6, [dstq+dst_strideq*2] + movd m7, [dstq+r6q] + pavg m0, m4 + pavg m1, m5 + pavg m2, m6 + pavg m3, m7 +%endif + movd [dstq ], m0 + movd [dstq+dst_strideq ], m1 + movd [dstq+dst_strideq*2], m2 + movd [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop4 + RET +%endif +%endmacro + +INIT_XMM sse2 +convolve_fn copy +convolve_fn avg +%if CONFIG_HIGHBITDEPTH +convolve_fn copy, highbd +convolve_fn avg, highbd +%endif diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm new file mode 100644 index 000000000..e6d357ba3 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm @@ -0,0 +1,965 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "aom_ports/x86_abi_support.asm" + +;Note: tap3 and tap4 have to be applied and added after other taps to avoid +;overflow. 
+ +%macro HIGH_GET_FILTERS_4 0 + mov rdx, arg(5) ;filter ptr + mov rcx, 0x00000040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + psrldq xmm7, 8 + pshuflw xmm4, xmm7, 0b ;k4 + pshuflw xmm5, xmm7, 01010101b ;k5 + pshuflw xmm6, xmm7, 10101010b ;k6 + pshuflw xmm7, xmm7, 11111111b ;k7 + + punpcklwd xmm0, xmm6 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm4 + punpcklwd xmm1, xmm7 + + movdqa k0k6, xmm0 + movdqa k2k5, xmm2 + movdqa k3k4, xmm3 + movdqa k1k7, xmm1 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 + + ;Compute max and min values of a pixel + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm0, rdx + movq xmm1, rcx + pshufd xmm0, xmm0, 0b + movdqa xmm2, xmm0 + psllw xmm0, xmm1 + psubw xmm0, xmm2 + pxor xmm1, xmm1 + movdqa max, xmm0 ;max value (for clamping) + movdqa min, xmm1 ;min value (for clamping) + +%endm + +%macro HIGH_APPLY_FILTER_4 1 + punpcklwd xmm0, xmm6 ;two row in one register + punpcklwd xmm1, xmm7 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm4 + + pmaddwd xmm0, k0k6 ;multiply the filter factors + pmaddwd xmm1, k1k7 + pmaddwd xmm2, k2k5 + pmaddwd xmm3, k3k4 + + paddd xmm0, xmm1 ;sum + paddd xmm0, xmm2 + paddd xmm0, xmm3 + + paddd xmm0, krd ;rounding + psrad xmm0, 7 ;shift + packssdw xmm0, xmm0 ;pack to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movq xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + movq [rdi], xmm0 +%endm + +%macro HIGH_GET_FILTERS 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + pshufhw xmm4, xmm7, 0b ;k4 + pshufhw xmm5, xmm7, 01010101b ;k5 + pshufhw xmm6, xmm7, 10101010b ;k6 + pshufhw xmm7, xmm7, 11111111b ;k7 + 
punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + punpcklwd xmm0, xmm1 + punpckhwd xmm6, xmm7 + punpckhwd xmm2, xmm5 + punpckhwd xmm3, xmm4 + + movdqa k0k1, xmm0 ;store filter factors on stack + movdqa k6k7, xmm6 + movdqa k2k5, xmm2 + movdqa k3k4, xmm3 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 ;rounding + + ;Compute max and min values of a pixel + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm0, rdx + movq xmm1, rcx + pshufd xmm0, xmm0, 0b + movdqa xmm2, xmm0 + psllw xmm0, xmm1 + psubw xmm0, xmm2 + pxor xmm1, xmm1 + movdqa max, xmm0 ;max value (for clamping) + movdqa min, xmm1 ;min value (for clamping) +%endm + +%macro LOAD_VERT_8 1 + movdqu xmm0, [rsi + %1] ;0 + movdqu xmm1, [rsi + rax + %1] ;1 + movdqu xmm6, [rsi + rdx * 2 + %1] ;6 + lea rsi, [rsi + rax] + movdqu xmm7, [rsi + rdx * 2 + %1] ;7 + movdqu xmm2, [rsi + rax + %1] ;2 + movdqu xmm3, [rsi + rax * 2 + %1] ;3 + movdqu xmm4, [rsi + rdx + %1] ;4 + movdqu xmm5, [rsi + rax * 4 + %1] ;5 +%endm + +%macro HIGH_APPLY_FILTER_8 2 + movdqu temp, xmm4 + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm1 + punpckhwd xmm4, xmm1 + movdqa xmm1, xmm6 + punpcklwd xmm6, xmm7 + punpckhwd xmm1, xmm7 + movdqa xmm7, xmm2 + punpcklwd xmm2, xmm5 + punpckhwd xmm7, xmm5 + + movdqu xmm5, temp + movdqu temp, xmm4 + movdqa xmm4, xmm3 + punpcklwd xmm3, xmm5 + punpckhwd xmm4, xmm5 + movdqu xmm5, temp + + pmaddwd xmm0, k0k1 + pmaddwd xmm5, k0k1 + pmaddwd xmm6, k6k7 + pmaddwd xmm1, k6k7 + pmaddwd xmm2, k2k5 + pmaddwd xmm7, k2k5 + pmaddwd xmm3, k3k4 + pmaddwd xmm4, k3k4 + + paddd xmm0, xmm6 + paddd xmm0, xmm2 + paddd xmm0, xmm3 + paddd xmm5, xmm1 + paddd xmm5, xmm7 + paddd xmm5, xmm4 + + paddd xmm0, krd ;rounding + paddd xmm5, krd + psrad xmm0, 7 ;shift + psrad xmm5, 7 + packssdw xmm0, xmm5 ;pack back to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movdqu xmm1, [rdi + %2] + pavgw xmm0, xmm1 +%endif + movdqu [rdi + %2], xmm0 +%endm + +;void aom_filter_block1d4_v8_sse2 +;( +; unsigned 
char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d4_v8_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movq xmm0, [rsi] ;load src: row 0 + movq xmm1, [rsi + rax] ;1 + movq xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2] ;7 + movq xmm2, [rsi + rax] ;2 + movq xmm3, [rsi + rax * 2] ;3 + movq xmm4, [rsi + rdx] ;4 + movq xmm5, [rsi + rax * 4] ;5 + + HIGH_APPLY_FILTER_4 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d8_v8_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 
4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 0, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d16_v8_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 0, 0 + sub rsi, rax + + LOAD_VERT_8 16 + HIGH_APPLY_FILTER_8 0, 16 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d4_v8_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end 
prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movq xmm0, [rsi] ;load src: row 0 + movq xmm1, [rsi + rax] ;1 + movq xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2] ;7 + movq xmm2, [rsi + rax] ;2 + movq xmm3, [rsi + rax * 2] ;3 + movq xmm4, [rsi + rdx] ;4 + movq xmm5, [rsi + rax * 4] ;5 + + HIGH_APPLY_FILTER_4 1 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d8_v8_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 1, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop 
rbp + ret + +global sym(aom_highbd_filter_block1d16_v8_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 1, 0 + sub rsi, rax + + LOAD_VERT_8 16 + HIGH_APPLY_FILTER_8 1, 16 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d4_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d4_h8_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu 
xmm0, [rsi - 6] ;load src + movdqu xmm4, [rsi + 2] + movdqa xmm1, xmm0 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm4 + + psrldq xmm1, 2 + psrldq xmm6, 4 + psrldq xmm7, 6 + psrldq xmm2, 4 + psrldq xmm3, 6 + psrldq xmm5, 2 + + HIGH_APPLY_FILTER_4 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d8_h8_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 0, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, 
+; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d16_h8_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 0, 0 + + movdqu xmm0, [rsi + 10] ;load src + movdqu xmm1, [rsi + 12] + movdqu xmm2, [rsi + 14] + movdqu xmm3, [rsi + 16] + movdqu xmm4, [rsi + 18] + movdqu xmm5, [rsi + 20] + movdqu xmm6, [rsi + 22] + movdqu xmm7, [rsi + 24] + + HIGH_APPLY_FILTER_8 0, 16 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d4_h8_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR 
arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm4, [rsi + 2] + movdqa xmm1, xmm0 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm4 + + psrldq xmm1, 2 + psrldq xmm6, 4 + psrldq xmm7, 6 + psrldq xmm2, 4 + psrldq xmm3, 6 + psrldq xmm5, 2 + + HIGH_APPLY_FILTER_4 1 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d8_h8_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 1, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d16_h8_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push 
rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 1, 0 + + movdqu xmm0, [rsi + 10] ;load src + movdqu xmm1, [rsi + 12] + movdqu xmm2, [rsi + 14] + movdqu xmm3, [rsi + 16] + movdqu xmm4, [rsi + 18] + movdqu xmm5, [rsi + 20] + movdqu xmm6, [rsi + 22] + movdqu xmm7, [rsi + 24] + + HIGH_APPLY_FILTER_8 1, 16 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm new file mode 100644 index 000000000..9e2ec748c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm @@ -0,0 +1,497 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. 
If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro HIGH_GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklwd xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm5, rdx + movq xmm2, rcx + pshufd xmm5, xmm5, 0b + movdqa xmm1, xmm5 + psllw xmm5, xmm2 + psubw xmm5, xmm1 ;max value (for clamping) + pxor xmm2, xmm2 ;min value (for clamping) + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro HIGH_APPLY_FILTER_4 1 + + punpcklwd xmm0, xmm1 ;two row in one register + pmaddwd xmm0, xmm4 ;multiply the filter factors + + paddd xmm0, xmm3 ;rounding + psrad xmm0, 7 ;shift + packssdw xmm0, xmm0 ;pack to word + + ;clamp the values + pminsw xmm0, xmm5 + pmaxsw xmm0, xmm2 + +%if %1 + movq xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + + movq [rdi], xmm0 + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +%if ARCH_X86_64 +%macro HIGH_GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm6, [rdx] ;load filters + + pshuflw xmm7, xmm6, 11111111b ;k3 + pshufhw xmm6, xmm6, 0b ;k4 + psrldq xmm6, 8 + punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm8, rdx + movq xmm5, rcx + pshufd xmm8, xmm8, 0b + movdqa xmm1, xmm8 + psllw xmm8, xmm5 + psubw xmm8, xmm1 ;max value (for clamping) + pxor xmm5, xmm5 ;min value (for clamping) + + 
movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro HIGH_APPLY_FILTER_8 1 + movdqa xmm6, xmm0 + punpckhwd xmm6, xmm1 + punpcklwd xmm0, xmm1 + pmaddwd xmm6, xmm7 + pmaddwd xmm0, xmm7 + + paddd xmm6, xmm4 ;rounding + paddd xmm0, xmm4 ;rounding + psrad xmm6, 7 ;shift + psrad xmm0, 7 ;shift + packssdw xmm0, xmm6 ;pack back to word + + ;clamp the values + pminsw xmm0, xmm8 + pmaxsw xmm0, xmm5 + +%if %1 + movdqu xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +%macro HIGH_APPLY_FILTER_16 1 + movdqa xmm9, xmm0 + movdqa xmm6, xmm2 + punpckhwd xmm9, xmm1 + punpckhwd xmm6, xmm3 + punpcklwd xmm0, xmm1 + punpcklwd xmm2, xmm3 + + pmaddwd xmm9, xmm7 + pmaddwd xmm6, xmm7 + pmaddwd xmm0, xmm7 + pmaddwd xmm2, xmm7 + + paddd xmm9, xmm4 ;rounding + paddd xmm6, xmm4 + paddd xmm0, xmm4 + paddd xmm2, xmm4 + + psrad xmm9, 7 ;shift + psrad xmm6, 7 + psrad xmm0, 7 + psrad xmm2, 7 + + packssdw xmm0, xmm9 ;pack back to word + packssdw xmm2, xmm6 ;pack back to word + + ;clamp the values + pminsw xmm0, xmm8 + pmaxsw xmm0, xmm5 + pminsw xmm2, xmm8 + pmaxsw xmm2, xmm5 + +%if %1 + movdqu xmm1, [rdi] + movdqu xmm3, [rdi + 16] + pavgw xmm0, xmm1 + pavgw xmm2, xmm3 +%endif + movdqu [rdi], xmm0 ;store the result + movdqu [rdi + 16], xmm2 ;store the result + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm +%endif + +global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movq xmm0, [rsi] ;load src + movq xmm1, [rsi + 2*rax] + + HIGH_APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if ARCH_X86_64 +global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE 
+sym(aom_highbd_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + 2*rax] ;1 + + HIGH_APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d16_v2_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm2, [rsi + 16] + movdqu xmm1, [rsi + 2*rax] ;1 + movdqu xmm3, [rsi + 2*rax + 16] + + HIGH_APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif + +global sym(aom_highbd_filter_block1d4_v2_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movq xmm0, [rsi] ;load src + movq xmm1, [rsi + 2*rax] + + HIGH_APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if ARCH_X86_64 +global sym(aom_highbd_filter_block1d8_v2_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + 2*rax] ;1 + + HIGH_APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d16_v2_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + 2*rax] ;1 + movdqu xmm2, [rsi + 16] + movdqu xmm3, [rsi + 2*rax + 
16] + + HIGH_APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif + +global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 2 + + HIGH_APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if ARCH_X86_64 +global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + + HIGH_APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d16_h2_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + movdqu xmm2, [rsi + 16] + movdqu xmm3, [rsi + 18] + + HIGH_APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif + +global sym(aom_highbd_filter_block1d4_h2_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 2 + + HIGH_APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if ARCH_X86_64 +global sym(aom_highbd_filter_block1d8_h2_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 
7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + + HIGH_APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d16_h2_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + movdqu xmm2, [rsi + 16] + movdqu xmm3, [rsi + 18] + + HIGH_APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c new file mode 100644 index 000000000..61476b8be --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c @@ -0,0 +1,575 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/x86/convolve.h" +#include "aom_ports/mem.h" + +// filters for 16_h8 and 16_v8 +DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +#if defined(__clang__) +#if (__clang_major__ > 0 && __clang_major__ < 3) || \ + (__clang_major__ == 3 && __clang_minor__ <= 3) || \ + (defined(__APPLE__) && defined(__apple_build_version__) && \ + ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ + (__clang_major__ == 5 && __clang_minor__ == 0))) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#else // clang > 3.3, and not 5.0 on macosx.
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // clang <= 3.3 +#elif defined(__GNUC__) +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +#else // gcc > 4.7 +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // gcc <= 4.6 +#else // !(gcc || clang) +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // __clang__ + +static void aom_filter_block1d16_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3))); + srcReg32b1 = _mm256_inserti128_si256( + srcReg32b1, + _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)), + 1); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + 
srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16( + srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5))); + srcReg32b2 = _mm256_inserti128_si256( + srcReg32b2, + _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)), + 1); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16( + srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // filter the source buffer + srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 
= _mm256_adds_epi16( + srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + srcRegFilt32b2_1 = _mm256_adds_epi16( + srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); + + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_stride; + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, + _mm256_castsi256_si128(srcRegFilt32b1_1)); + + // save the next 16 bits + _mm_store_si128((__m128i *)(output_ptr + output_pitch), + _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + output_ptr += dst_stride; + } + + // if the number of strides is odd. 
+ // process only 16 bytes + if (i > 0) { + __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes + // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + + // add and saturate the results together + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + // filter the source buffer + srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = + _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, 
_mm256_castsi256_si128(forthFilters));
+
+    // add and saturate the results together
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+    // filter the source buffer
+    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 =
+        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+    srcRegFilt2 =
+        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
+
+    // add and saturate the results together
+    srcRegFilt2_1 =
+        _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+    srcRegFilt2_1 =
+        _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+    srcRegFilt1_1 =
+        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64));
+
+    srcRegFilt2_1 =
+        _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64));
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+    // save 16 bytes
+    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
+  }
+}
+
+// Vertical 8-tap filter for a 16-pixel-wide column (AVX2).
+//
+// Every output byte is ((sum over 8 taps of tap * source byte from 8
+// vertically adjacent rows) + 64) >> 7, accumulated with saturating 16-bit
+// adds and packed to 8 bits with unsigned saturation. The main loop produces
+// two output rows per iteration (one per 128-bit lane); a trailing odd row is
+// handled by the 128-bit tail at the end.
+//
+//   src_ptr       - first of the 8 source rows feeding output row 0. Rows
+//                   0 .. output_height + 6 (relative to src_ptr) are read,
+//                   16 bytes each, with unaligned loads.
+//   src_pitch     - byte distance between consecutive source rows.
+//   output_ptr    - destination. Rows are written with _mm_store_si128, an
+//                   aligned store, so each output row address has to be
+//                   16-byte aligned.
+//   out_pitch     - byte distance between consecutive output rows.
+//   output_height - number of output rows to produce.
+//   filter        - 8 int16_t taps; they are narrowed to int8 with signed
+//                   saturation before use.
+static void aom_filter_block1d16_v8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg64;
+  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
+  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
+  __m256i srcReg32b11, srcReg32b12, filtersReg32;
+  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  // (every 16-bit element is 64: the rounding term added before the >> 7)
+  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to 8 bit (byte) and have the
+  // same data in both lanes of 128 bit register.
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 256 bit register
+  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across 256 bit register
+  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 256 bit register
+  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+  // across 256 bit register
+  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+  // multiply the source and destination strides by two, because the main
+  // loop consumes and produces two rows per iteration
+  src_stride = src_pitch << 1;
+  dst_stride = out_pitch << 1;
+
+  // load 16 bytes 7 times in stride of src_pitch
+  // (only the low 128 bits are defined after the cast; the high lanes are
+  // filled by the inserts below, and for srcReg32b7 inside the loop)
+  srcReg32b1 =
+      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr)));
+  srcReg32b2 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
+  srcReg32b3 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
+  srcReg32b4 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
+  srcReg32b5 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
+  srcReg32b6 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
+  srcReg32b7 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+
+  // have each consecutive loads on the same 256 register
+  // (register n: low lane = row n, high lane = row n + 1)
+  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
+                                       _mm256_castsi256_si128(srcReg32b2), 1);
+  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
+                                       _mm256_castsi256_si128(srcReg32b3), 1);
+  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
+                                       _mm256_castsi256_si128(srcReg32b4), 1);
+  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
+                                       _mm256_castsi256_si128(srcReg32b5), 1);
+  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
+                                       _mm256_castsi256_si128(srcReg32b6), 1);
+  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
+                                       _mm256_castsi256_si128(srcReg32b7), 1);
+
+  // merge every two consecutive registers except the last one
+  // (byte-interleave vertically adjacent rows so that one maddubs applies a
+  // pair of taps; "lo" covers pixels 0-7, "hi" covers pixels 8-15)
+  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
+  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
+
+  // save
+  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
+
+  // save
+  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
+
+  // save
+  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
+
+  // save
+  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
+
+  // i counts the output rows still to be written; two are done per pass
+  for (i = output_height; i > 1; i -= 2) {
+    // load the last 2 loads of 16 bytes and have every two
+    // consecutive loads in the same 256 bit register
+    srcReg32b8 = _mm256_castsi128_si256(
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
+    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
+                                         _mm256_castsi256_si128(srcReg32b8), 1);
+    srcReg32b9 = _mm256_castsi128_si256(
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
+    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
+                                         _mm256_castsi256_si128(srcReg32b9), 1);
+
+    // merge every two consecutive registers
+    // save
+    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
+    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
+
+    // pixels 0-7: multiply 2 adjacent elements with the filter and add the
+    // result
+    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
+    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+
+    // add and saturate the results together
+    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
+    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
+
+    // add and saturate the results together: the smaller of the two middle
+    // products is added first, then the larger.
+    // NOTE(review): presumably this ordering keeps the saturating adds from
+    // clipping an intermediate sum when the final sum is in range - confirm.
+    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+                                    _mm256_min_epi16(srcReg32b8, srcReg32b12));
+    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+                                    _mm256_max_epi16(srcReg32b8, srcReg32b12));
+
+    // pixels 8-15: multiply 2 adjacent elements with the filter and add the
+    // result
+    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
+    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
+
+    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
+    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
+
+    // add and saturate the results together
+    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+                                   _mm256_min_epi16(srcReg32b8, srcReg32b12));
+    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+                                   _mm256_max_epi16(srcReg32b8, srcReg32b12));
+
+    // add the rounding term
+    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
+    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
+
+    // shift by 7 bit each 16 bit
+    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
+    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
+
+    src_ptr += src_stride;
+
+    // save 16 bytes (first output row of this pass)
+    _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1));
+
+    // save the next 16 bytes (second output row of this pass)
+    _mm_store_si128((__m128i *)(output_ptr + out_pitch),
+                    _mm256_extractf128_si256(srcReg32b1, 1));
+
+    output_ptr += dst_stride;
+
+    // save part of the registers for next strides: the window slides down
+    // two rows, so each interleaved row pair moves up one slot and the last
+    // loaded row becomes the new seventh row
+    srcReg32b10 = srcReg32b11;
+    srcReg32b1 = srcReg32b3;
+    srcReg32b11 = srcReg32b2;
+    srcReg32b3 = srcReg32b5;
+    srcReg32b2 = srcReg32b4;
+    srcReg32b5 = srcReg32b7;
+    srcReg32b7 = srcReg32b9;
+  }
+  // odd output_height: one row is left, computed from the low 128-bit lanes
+  if (i > 0) {
+    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
+    // load the last 16 bytes
+    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+
+    // merge the last 2 results together
+    srcRegFilt4 =
+        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+    srcRegFilt7 =
+        _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
+                                    _mm256_castsi256_si128(firstFilters));
+    srcRegFilt4 =
+        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
+    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
+                                    _mm256_castsi256_si128(firstFilters));
+    srcRegFilt7 =
+        _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));
+
+    // add and saturate the results together
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
+                                    _mm256_castsi256_si128(secondFilters));
+    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
+                                    _mm256_castsi256_si128(secondFilters));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
+                                    _mm256_castsi256_si128(thirdFilters));
+    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
+                                    _mm256_castsi256_si128(thirdFilters));
+
+    // add and saturate the results together (smaller middle product first)
+    srcRegFilt1 =
+        _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6));
+    srcRegFilt3 =
+        _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7));
+
+    // add and saturate the results together (then the larger one)
+    srcRegFilt1 =
+        _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6));
+    srcRegFilt3 =
+        _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7));
+
+    // add the rounding term
+    srcRegFilt1 =
+        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64));
+    srcRegFilt3 =
+        _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64));
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);
+
+    // shrink to 8 bit each 16 bits: the low 8 bytes come from srcRegFilt1
+    // (pixels 0-7) and the high 8 bytes from srcRegFilt3 (pixels 8-15)
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
+
+    // save 16 bytes
+    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
+  }
+}
+
+#if HAVE_AVX2 && HAVE_SSSE3
+filter8_1dfunction aom_filter_block1d4_v8_ssse3;
+#if ARCH_X86_64
+filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3;
+#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_intrin_ssse3
+#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_intrin_ssse3
+#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_intrin_ssse3
+#else  // ARCH_X86
+filter8_1dfunction aom_filter_block1d8_v8_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_ssse3;
+#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_ssse3
+#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_ssse3
+#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_ssse3
+#endif  // ARCH_X86_64
+filter8_1dfunction aom_filter_block1d16_v2_ssse3;
+filter8_1dfunction aom_filter_block1d16_h2_ssse3;
+filter8_1dfunction
aom_filter_block1d8_v2_ssse3; +filter8_1dfunction aom_filter_block1d8_h2_ssse3; +filter8_1dfunction aom_filter_block1d4_v2_ssse3; +filter8_1dfunction aom_filter_block1d4_h2_ssse3; +#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3 +#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3 +#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3 +#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3 +#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3 +#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3 +#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3 +// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); + +// void aom_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, avx2); +#endif // HAVE_AVX2 && HAVE_SSSE3 diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c new file mode 100644 index 000000000..be37738df --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c @@ -0,0 +1,920 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// NOTE(review): the include directive below has no header name in this copy.
+// It looks like an angle-bracket include was stripped during extraction,
+// presumably <tmmintrin.h> since SSSE3 intrinsics are used below - confirm
+// against the upstream file.
+#include
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/emmintrin_compat.h"
+
+// The tables below are byte-shuffle control masks for _mm_shuffle_epi8. Each
+// consecutive index pair (n, n + 1) puts two neighbouring source bytes side
+// by side so that one _mm_maddubs_epi16 can multiply them by a pair of taps.
+// They are declared 16-byte aligned and read with the aligned
+// _mm_load_si128.
+
+// filters only for the 4_h8 convolution
+// low 8 bytes: byte pairs for the first tap pair of output pixels 0-3;
+// high 8 bytes: the same four pixels shifted by two bytes, for the second
+// tap pair
+DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
+  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
+};
+
+// same layout, offset by four bytes: third tap pair in the low half and
+// fourth tap pair in the high half
+DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+// filters for 8_h8 and 16_h8
+// one mask per tap pair; each covers 8 output pixels and starts two bytes
+// later than the previous one
+DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+// These are reused by the avx2 intrinsics.
+filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3;
+
+// Horizontal 8-tap filter producing 4 output pixels per row (SSSE3).
+//
+// For every row, 16 bytes starting at src_ptr - 3 are loaded (unaligned) and
+// each output byte is ((sum of 8 tap products) + 64) >> 7, accumulated with
+// saturating 16-bit adds and packed to 8 bits with unsigned saturation.
+//
+//   src_ptr             - address of the first output pixel's centre sample;
+//                         the load starts 3 bytes before it.
+//   src_pixels_per_line - byte distance between consecutive source rows.
+//   output_ptr          - destination; 4 bytes are written per row.
+//   output_pitch        - byte distance between consecutive output rows.
+//   output_height       - number of rows to process.
+//   filter              - 8 int16_t taps; narrowed to int8 with signed
+//                         saturation before use.
+void aom_filter_block1d4_h8_intrin_ssse3(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+  __m128i addFilterReg64, filtersReg, srcReg, minReg;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  // (every 16-bit element is 64: the rounding term added before the >> 7)
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to 8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits in the filter into the first lane
+  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
+  // duplicate only the third 16 bit in the filter into the first lane
+  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
+  // duplicate only the second 16 bits in the filter into the second lane
+  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
+  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
+  // duplicate only the fourth 16 bits in the filter into the second lane
+  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
+  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
+
+  // loading the local filters (byte-shuffle masks for the 4-wide case)
+  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
+  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+    // filter the source buffer
+    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    // srcRegFilt1: k0k1 products (low half), k2k3 products (high half)
+    // srcRegFilt2: k4k5 products (low half), k6k7 products (high half)
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // extract the higher half of the lane
+    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
+    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
+
+    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
+
+    // add and saturate all the results together: outer tap pairs first,
+    // then the smaller and finally the larger of the two middle products.
+    // NOTE(review): presumably this ordering keeps the saturating adds from
+    // clipping an intermediate sum when the final sum is in range - confirm.
+    // Only the low four 16-bit elements carry the result from here on.
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift by 7 bit each 16 bits
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink to 8 bit each 16 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+    src_ptr += src_pixels_per_line;
+
+    // save only 4 bytes
+    // NOTE(review): the store goes through an int lvalue on a uint8_t
+    // buffer, so it presumably relies on output_ptr being suitably aligned
+    // for int; copying the 32-bit value with memcpy would avoid that
+    // assumption - confirm against callers.
+    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);
+
+    output_ptr += output_pitch;
+  }
+}
+
+// Horizontal 8-tap filter producing 8 output pixels per row (SSSE3).
+//
+// For every row, 16 bytes starting at src_ptr - 3 are loaded (unaligned),
+// four shuffles build the byte pairs for the four tap pairs, and each output
+// byte is ((sum of 8 tap products) + 64) >> 7, accumulated with saturating
+// 16-bit adds and packed to 8 bits with unsigned saturation.
+//
+//   src_ptr             - address of the first output pixel's centre sample;
+//                         the load starts 3 bytes before it.
+//   src_pixels_per_line - byte distance between consecutive source rows.
+//   output_ptr          - destination; 8 bytes are written per row with
+//                         _mm_storel_epi64.
+//   output_pitch        - byte distance between consecutive output rows.
+//   output_height       - number of rows to process.
+//   filter              - 8 int16_t taps; narrowed to int8 with signed
+//                         saturation before use.
+void aom_filter_block1d8_h8_intrin_ssse3(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
+  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+  __m128i addFilterReg64, filtersReg, minReg;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  // (every 16-bit element is 64: the rounding term added before the >> 7)
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to 8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 128 bit register
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across 128 bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 128 bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+  // across 128 bit register
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  // byte-shuffle masks, one per tap pair
+  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+    // filter the source buffer
+    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // filter the source buffer
+    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
+    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
+    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
+
+    // add and saturate all the results together: outer tap pairs first,
+    // then the smaller and finally the larger of the two middle products
+    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift by 7 bit each 16 bits
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink to 8 bit each 16 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+    src_ptr += src_pixels_per_line;
+
+    // save only 8 bytes
+    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
+
+    output_ptr += output_pitch;
+  }
+}
+
+// Vertical 8-tap filter for an 8-pixel-wide column (SSSE3).
+//
+// Keeps a sliding window of 8 source rows: 7 rows are loaded up front, and
+// each iteration loads one more row, produces one output row and shifts the
+// window down by a row. Each output byte is ((sum of 8 tap products) + 64)
+// >> 7, accumulated with saturating 16-bit adds and packed to 8 bits with
+// unsigned saturation.
+//
+//   src_ptr       - first of the 8 source rows feeding output row 0. Rows
+//                   0 .. output_height + 6 (relative to src_ptr) are read,
+//                   8 bytes each.
+//   src_pitch     - byte distance between consecutive source rows.
+//   output_ptr    - destination; 8 bytes are written per row with
+//                   _mm_storel_epi64.
+//   out_pitch     - byte distance between consecutive output rows.
+//   output_height - number of output rows to produce.
+//   filter        - 8 int16_t taps; narrowed to int8 with signed saturation
+//                   before use.
+void aom_filter_block1d8_v8_intrin_ssse3(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i addFilterReg64, filtersReg, minReg;
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
+  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+  __m128i srcReg8;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  // (every 16-bit element is 64: the rounding term added before the >> 7)
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to 8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits in the filter
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits in the filter
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits in the filter
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits in the filter
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  // load the first 7 rows of 8 bytes
+  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+
+  for (i = 0; i < output_height; i++) {
+    // load the last 8 bytes (eighth row of the current window)
+    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+
+    // merge the result together (byte-interleave vertically adjacent rows so
+    // one maddubs applies a pair of taps)
+    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+
+    // merge the result together
+    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
+
+    // add and saturate the results together: outer tap pairs first, then
+    // the smaller and finally the larger of the two middle products
+    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
+    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink to 8 bit each 16 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+    src_ptr += src_pitch;
+
+    // shift down a row
+    srcReg1 = srcReg2;
+    srcReg2 = srcReg3;
+    srcReg3 = srcReg4;
+    srcReg4 = srcReg5;
+    srcReg5 = srcReg6;
+    srcReg6 = srcReg7;
+    srcReg7 = srcReg8;
+
+    // save only 8 bytes convolve result
+    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
+
+    output_ptr += out_pitch;
+  }
+}
+
+filter8_1dfunction aom_filter_block1d16_v8_ssse3;
+filter8_1dfunction aom_filter_block1d16_h8_ssse3;
+filter8_1dfunction aom_filter_block1d8_v8_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_ssse3;
+filter8_1dfunction aom_filter_block1d4_v8_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_ssse3;
+filter8_1dfunction aom_filter_block1d16_v8_avg_ssse3;
+filter8_1dfunction aom_filter_block1d16_h8_avg_ssse3;
+filter8_1dfunction aom_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction aom_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_avg_ssse3;
+#if CONFIG_LOOP_RESTORATION
+filter8_1dfunction aom_filter_block1d16_v8_add_src_ssse3;
+filter8_1dfunction aom_filter_block1d16_h8_add_src_ssse3;
+filter8_1dfunction aom_filter_block1d8_v8_add_src_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_add_src_ssse3;
+filter8_1dfunction aom_filter_block1d4_v8_add_src_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_add_src_ssse3;
+#endif
+
+filter8_1dfunction aom_filter_block1d16_v2_ssse3;
+filter8_1dfunction aom_filter_block1d16_h2_ssse3;
+filter8_1dfunction aom_filter_block1d8_v2_ssse3;
+filter8_1dfunction aom_filter_block1d8_h2_ssse3;
+filter8_1dfunction aom_filter_block1d4_v2_ssse3;
+filter8_1dfunction aom_filter_block1d4_h2_ssse3;
+filter8_1dfunction aom_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction aom_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction aom_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction aom_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction aom_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction aom_filter_block1d4_h2_avg_ssse3;
+
+// void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                uint8_t *dst, ptrdiff_t dst_stride,
+//                                const int16_t *filter_x, int x_step_q4,
+//                                const int16_t *filter_y, int y_step_q4,
+//                                int w, int h);
+// void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+// void aom_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                    uint8_t *dst, ptrdiff_t dst_stride,
+//                                    const int16_t *filter_x, int x_step_q4,
+//                                    const int16_t *filter_y, int y_step_q4,
+//                                    int w, int h);
+// void aom_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                   uint8_t *dst, ptrdiff_t dst_stride,
+//                                   const int16_t *filter_x, int x_step_q4,
+//                                   const int16_t *filter_y, int y_step_q4,
+//                                   int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+            ssse3);
+
+#if CONFIG_LOOP_RESTORATION
+FUN_CONV_1D_NO_BILINEAR(add_src_horiz, x_step_q4, filter_x, h, src, add_src_,
+                        ssse3);
+FUN_CONV_1D_NO_BILINEAR(add_src_vert, y_step_q4, filter_y, v,
+                        src - src_stride * 3, add_src_, ssse3);
+#endif
+
+// Transposes an 8x8 block of bytes.
+//
+//   in0..in7   - __m128i values whose low 8 bytes hold rows 0..7 of the
+//                block; the high 8 bytes are not used.
+//   out0..out7 - assigned columns 0..7 of the block (byte r of outN comes
+//                from row r); each column is stored in the low 64 bits and
+//                repeated in the high 64 bits.
+//
+// The body is a brace-enclosed statement block that declares the temporaries
+// tr0_*, tr1_* and tr2_*; the arguments are substituted textually.
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+                      out2, out3, out4, out5, out6, out7)                 \
+  {                                                                       \
+    const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1);                    \
+    const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3);                    \
+    const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5);                    \
+    const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7);                    \
+                                                                          \
+    const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1);               \
+    const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1);               \
+    const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3);               \
+    const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3);               \
+                                                                          \
+    const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2);               \
+    const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2);               \
+    const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3);               \
+    const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3);               \
+                                                                          \
+    out0 = _mm_unpacklo_epi64(tr2_0, tr2_0);                              \
+    out1 = _mm_unpackhi_epi64(tr2_0, tr2_0);                              \
+    out2 = _mm_unpacklo_epi64(tr2_1, tr2_1);                              \
+    out3 = _mm_unpackhi_epi64(tr2_1, tr2_1);                              \
+    out4 = _mm_unpacklo_epi64(tr2_2, tr2_2);                              \
+    out5 = _mm_unpackhi_epi64(tr2_2, tr2_2);                              \
+    out6 = _mm_unpacklo_epi64(tr2_3, tr2_3);                              \
+    out7 = _mm_unpackhi_epi64(tr2_3, tr2_3);                              \
+  }
+
+static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
+                                  uint8_t *dst, const int16_t *x_filter) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
+  // pack and duplicate the filter values
+  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
+  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
+  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
+  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
+  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
+  const __m128i F =
_mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5)); + const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6)); + const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7)); + // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 + const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); + // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 + const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); + // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57 + const __m128i tr0_2 = _mm_unpacklo_epi16(E, F); + // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77 + const __m128i tr0_3 = _mm_unpacklo_epi16(G, H); + // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 + const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1); + // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73 + const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3); + // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77 + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2); + const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2); + const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3); + const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); + const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); + const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); + const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); + // add and saturate the results together + const __m128i min_x2x1 = _mm_min_epi16(x2, x1); + const __m128i max_x2x1 = _mm_max_epi16(x2, x1); + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, min_x2x1); + temp = _mm_adds_epi16(temp, max_x2x1); + // round and shift by 7 bit each 16 bit + temp = _mm_mulhrs_epi16(temp, k_256); + // shrink to 8 bit 
each 16 bits + temp = _mm_packus_epi16(temp, temp); + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i *)dst, temp); +} + +static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride) { + __m128i A, B, C, D, E, F, G, H; + + A = _mm_loadl_epi64((const __m128i *)src); + B = _mm_loadl_epi64((const __m128i *)(src + src_stride)); + C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); + D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); + E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); + F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5)); + G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6)); + H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7)); + + TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H); + + _mm_storel_epi64((__m128i *)dst, A); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H); +} + +static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + int x, y, z; + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 8x8 areas. The intermediate height is not always + // a multiple of 8, so force it to be a multiple of 8 here. 
+ y = h + (8 - (h & 0x7)); + + do { + int x_q4 = x0_q4; + for (x = 0; x < w; x += 8) { + // process 8 src_x steps + for (z = 0; z < 8; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter); + } else { + int i; + for (i = 0; i < 8; ++i) { + temp[z * 8 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 8x8 filters values back to dst + transpose8x8_to_dst(temp, 8, dst + x, dst_stride); + } + + src += src_stride * 8; + dst += dst_stride * 8; + } while (y -= 8); +} + +static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *filter) { + const __m128i k_256 = _mm_set1_epi16(1 << 8); + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); + const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); + const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); + const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); + const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); + const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); + const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + // TRANSPOSE... 
+ // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // + // TO + // + // 00 10 20 30 + // 01 11 21 31 + // 02 12 22 32 + // 03 13 23 33 + // 04 14 24 34 + // 05 15 25 35 + // 06 16 26 36 + // 07 17 27 37 + // + // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 + const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); + // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 + const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); + // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 + const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 + const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1); + // 02 03 12 13 22 23 32 33 + const __m128i s3s2 = _mm_srli_si128(s1s0, 8); + // 06 07 16 17 26 27 36 37 + const __m128i s7s6 = _mm_srli_si128(s5s4, 8); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); + const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); + const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); + const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); + // add and saturate the results together + const __m128i min_x2x1 = _mm_min_epi16(x2, x1); + const __m128i max_x2x1 = _mm_max_epi16(x2, x1); + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, min_x2x1); + temp = _mm_adds_epi16(temp, max_x2x1); + // round and shift by 7 bit each 16 bit + temp = _mm_mulhrs_epi16(temp, k_256); + // shrink to 8 bit each 16 bits + temp = _mm_packus_epi16(temp, temp); + // save only 4 bytes + *(int *)dst = _mm_cvtsi128_si32(temp); +} + +static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride) { + __m128i A = _mm_cvtsi32_si128(*(const int *)src); + __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride)); + __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2)); + __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3)); + 
// 00 10 01 11 02 12 03 13 + const __m128i tr0_0 = _mm_unpacklo_epi8(A, B); + // 20 30 21 31 22 32 23 33 + const __m128i tr0_1 = _mm_unpacklo_epi8(C, D); + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + A = _mm_unpacklo_epi16(tr0_0, tr0_1); + B = _mm_srli_si128(A, 4); + C = _mm_srli_si128(A, 8); + D = _mm_srli_si128(A, 12); + + *(int *)(dst) = _mm_cvtsi128_si32(A); + *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); + *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); + *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); +} + +static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); + int x, y, z; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; y += 4) { + int x_q4 = x0_q4; + for (x = 0; x < w; x += 4) { + // process 4 src_x steps + for (z = 0; z < 4; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter); + } else { + int i; + for (i = 0; i < 4; ++i) { + temp[z * 4 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 4x4 filters values back to dst + transpose4x4_to_dst(temp, 4, dst + x, dst_stride); + } + + src += src_stride * 4; + dst += dst_stride * 4; + } +} + +static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *filter) { + const __m128i k_256 = _mm_set1_epi16(1 << 8); + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); + const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); + const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); + 
const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); + const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr); + const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch)); + const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2)); + const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3)); + const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4)); + const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5)); + const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6)); + const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7)); + const __m128i s1s0 = _mm_unpacklo_epi8(A, B); + const __m128i s3s2 = _mm_unpacklo_epi8(C, D); + const __m128i s5s4 = _mm_unpacklo_epi8(E, F); + const __m128i s7s6 = _mm_unpacklo_epi8(G, H); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); + const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); + const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); + const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); + // add and saturate the results together + const __m128i min_x2x1 = _mm_min_epi16(x2, x1); + const __m128i max_x2x1 = _mm_max_epi16(x2, x1); + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, min_x2x1); + temp = _mm_adds_epi16(temp, max_x2x1); + // round and shift by 7 bit each 16 bit + temp = _mm_mulhrs_epi16(temp, k_256); + // shrink to 8 bit each 16 bits + temp = _mm_packus_epi16(temp, temp); + // save only 4 bytes + *(int *)dst = _mm_cvtsi128_si32(temp); +} + +static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> 
SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); + } + + y_q4 += y_step_q4; + } +} + +static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *filter) { + const __m128i k_256 = _mm_set1_epi16(1 << 8); + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); + const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); + const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); + const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); + const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); + const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); + const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); + const __m128i s1s0 = _mm_unpacklo_epi8(A, B); + const __m128i s3s2 = _mm_unpacklo_epi8(C, D); + const __m128i s5s4 = _mm_unpacklo_epi8(E, F); + const __m128i s7s6 = _mm_unpacklo_epi8(G, H); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); + const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); + const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); + const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); + // add and 
saturate the results together + const __m128i min_x2x1 = _mm_min_epi16(x2, x1); + const __m128i max_x2x1 = _mm_max_epi16(x2, x1); + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, min_x2x1); + temp = _mm_adds_epi16(temp, max_x2x1); + // round and shift by 7 bit each 16 bit + temp = _mm_mulhrs_epi16(temp, k_256); + // shrink to 8 bit each 16 bits + temp = _mm_packus_epi16(temp, temp); + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i *)dst, temp); +} + +static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + if (y_q4 & SUBPEL_MASK) { + filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); + } + y_q4 += y_step_q4; + } +} + +static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *filter, int w) { + const __m128i k_256 = _mm_set1_epi16(1 << 8); + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); + const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); + const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); + const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); + int i; + + for (i = 0; i < w; i += 16) { + const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr); + const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); + const __m128i C = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + const 
__m128i D = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + const __m128i E = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + const __m128i F = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + const __m128i G = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + const __m128i H = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); + // merge the result together + const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B); + const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H); + const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B); + const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0); + const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6); + const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0); + const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6); + // add and saturate the results together + const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo); + const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi); + // merge the result together + const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D); + const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2); + const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2); + // merge the result together + const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F); + const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4); + const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4); + // add and saturate the results together + __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo)); + __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi)); + + // add and saturate the results together + temp_lo = 
_mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo)); + temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi)); + // round and shift by 7 bit each 16 bit + temp_lo = _mm_mulhrs_epi16(temp_lo, k_256); + temp_hi = _mm_mulhrs_epi16(temp_hi, k_256); + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + temp_hi = _mm_packus_epi16(temp_lo, temp_hi); + src_ptr += 16; + // save 16 bytes convolve result + _mm_store_si128((__m128i *)&dst[i], temp_hi); + } +} + +static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + if (y_q4 & SUBPEL_MASK) { + filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter, + w); + } else { + memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); + } + y_q4 += y_step_q4; + } +} + +static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, int x0_q4, + int x_step_q4, const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. 
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. + DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_SB_SIZE]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + if (w >= 8) { + scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4, + x_step_q4, w, intermediate_height); + } else { + scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4, + x_step_q4, w, intermediate_height); + } + + if (w >= 16) { + scaledconvolve_vert_w16(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, + y_step_q4, w, h); + } else if (w == 8) { + scaledconvolve_vert_w8(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, + y_step_q4, w, h); + } else { + scaledconvolve_vert_w4(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, + y_step_q4, w, h); + } +} + +static const InterpKernel *get_filter_base(const int16_t *filter) { + // NOTE: This assumes that the filter table is 256-byte aligned. + // TODO(agrange) Modify to make independent of table alignment. 
+ return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); +} + +static int get_filter_offset(const int16_t *f, const InterpKernel *base) { + return (int)((const InterpKernel *)(intptr_t)f - base); +} + +void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, filters_y, y0_q4, y_step_q4, w, h); +} + +// void aom_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, ssse3); +FUN_CONV_2D(avg_, ssse3); +#if CONFIG_LOOP_RESTORATION +FUN_CONV_2D_NO_BILINEAR(add_src_, add_src_, ssse3); +#endif diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm new file mode 100644 index 000000000..b946010d3 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm @@ -0,0 +1,990 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. 
If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "aom_ports/x86_abi_support.asm" + +;Note: tap3 and tap4 have to be applied and added after other taps to avoid +;overflow. + +%macro GET_FILTERS_4 0 + mov rdx, arg(5) ;filter ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + psrldq xmm7, 8 + pshuflw xmm4, xmm7, 0b ;k4 + pshuflw xmm5, xmm7, 01010101b ;k5 + pshuflw xmm6, xmm7, 10101010b ;k6 + pshuflw xmm7, xmm7, 11111111b ;k7 + + punpcklqdq xmm0, xmm1 + punpcklqdq xmm2, xmm3 + punpcklqdq xmm5, xmm4 + punpcklqdq xmm6, xmm7 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm2 + movdqa k5k4, xmm5 + movdqa k6k7, xmm6 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 + + pxor xmm7, xmm7 + movdqa zero, xmm7 +%endm + +%macro APPLY_FILTER_4 1 + punpckldq xmm0, xmm1 ;two row in one register + punpckldq xmm6, xmm7 + punpckldq xmm2, xmm3 + punpckldq xmm5, xmm4 + + punpcklbw xmm0, zero ;unpack to word + punpcklbw xmm6, zero + punpcklbw xmm2, zero + punpcklbw xmm5, zero + + pmullw xmm0, k0k1 ;multiply the filter factors + pmullw xmm6, k6k7 + pmullw xmm2, k2k3 + pmullw xmm5, k5k4 + + paddsw xmm0, xmm6 ;sum + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 + paddsw xmm0, xmm2 + psrldq xmm2, 8 + paddsw xmm0, xmm5 + psrldq xmm5, 8 + paddsw xmm0, xmm2 + paddsw xmm0, xmm5 + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 +%endm + +%macro GET_FILTERS 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, 
xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + pshufhw xmm4, xmm7, 0b ;k4 + pshufhw xmm5, xmm7, 01010101b ;k5 + pshufhw xmm6, xmm7, 10101010b ;k6 + pshufhw xmm7, xmm7, 11111111b ;k7 + + punpcklwd xmm0, xmm0 + punpcklwd xmm1, xmm1 + punpcklwd xmm2, xmm2 + punpcklwd xmm3, xmm3 + punpckhwd xmm4, xmm4 + punpckhwd xmm5, xmm5 + punpckhwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + movdqa k0, xmm0 ;store filter factors on stack + movdqa k1, xmm1 + movdqa k2, xmm2 + movdqa k3, xmm3 + movdqa k4, xmm4 + movdqa k5, xmm5 + movdqa k6, xmm6 + movdqa k7, xmm7 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 ;rounding + + pxor xmm7, xmm7 + movdqa zero, xmm7 +%endm + +%macro LOAD_VERT_8 1 + movq xmm0, [rsi + %1] ;0 + movq xmm1, [rsi + rax + %1] ;1 + movq xmm6, [rsi + rdx * 2 + %1] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2 + %1] ;7 + movq xmm2, [rsi + rax + %1] ;2 + movq xmm3, [rsi + rax * 2 + %1] ;3 + movq xmm4, [rsi + rdx + %1] ;4 + movq xmm5, [rsi + rax * 4 + %1] ;5 +%endm + +%macro APPLY_FILTER_8 2 + punpcklbw xmm0, zero + punpcklbw xmm1, zero + punpcklbw xmm6, zero + punpcklbw xmm7, zero + punpcklbw xmm2, zero + punpcklbw xmm5, zero + punpcklbw xmm3, zero + punpcklbw xmm4, zero + + pmullw xmm0, k0 + pmullw xmm1, k1 + pmullw xmm6, k6 + pmullw xmm7, k7 + pmullw xmm2, k2 + pmullw xmm5, k5 + pmullw xmm3, k3 + pmullw xmm4, k4 + + paddsw xmm0, xmm1 + paddsw xmm0, xmm6 + paddsw xmm0, xmm7 + paddsw xmm0, xmm2 + paddsw xmm0, xmm5 + paddsw xmm0, xmm3 + paddsw xmm0, xmm4 + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi + %2] + pavgb xmm0, xmm1 +%endif + movq [rdi + %2], xmm0 +%endm + +;void aom_filter_block1d4_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_filter_block1d4_v8_sse2) PRIVATE +sym(aom_filter_block1d4_v8_sse2): + push rbp + mov rbp, rsp 
+ SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movd xmm0, [rsi] ;load src: row 0 + movd xmm1, [rsi + rax] ;1 + movd xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movd xmm7, [rsi + rdx * 2] ;7 + movd xmm2, [rsi + rax] ;2 + movd xmm3, [rsi + rax * 2] ;3 + movd xmm4, [rsi + rdx] ;4 + movd xmm5, [rsi + rax * 4] ;5 + + APPLY_FILTER_4 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_filter_block1d8_v8_sse2) PRIVATE +sym(aom_filter_block1d8_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 0, 0 + + lea rdi, [rdi + 
rbx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_filter_block1d16_v8_sse2) PRIVATE +sym(aom_filter_block1d16_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 0, 0 + sub rsi, rax + + LOAD_VERT_8 8 + APPLY_FILTER_8 0, 8 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_v8_avg_sse2) PRIVATE +sym(aom_filter_block1d4_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + 
+.loop: + movd xmm0, [rsi] ;load src: row 0 + movd xmm1, [rsi + rax] ;1 + movd xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movd xmm7, [rsi + rdx * 2] ;7 + movd xmm2, [rsi + rax] ;2 + movd xmm3, [rsi + rax * 2] ;3 + movd xmm4, [rsi + rdx] ;4 + movd xmm5, [rsi + rax * 4] ;5 + + APPLY_FILTER_4 1 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_v8_avg_sse2) PRIVATE +sym(aom_filter_block1d8_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 1, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_v8_avg_sse2) PRIVATE +sym(aom_filter_block1d16_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line 
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 1, 0 + sub rsi, rax + + LOAD_VERT_8 8 + APPLY_FILTER_8 1, 8 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d4_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_filter_block1d4_h8_sse2) PRIVATE +sym(aom_filter_block1d4_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm3, 3 + psrldq xmm5, 5 + psrldq xmm4, 4 + + APPLY_FILTER_4 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_filter_block1d8_h8_sse2) PRIVATE 
+sym(aom_filter_block1d8_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_filter_block1d16_h8_sse2) PRIVATE +sym(aom_filter_block1d16_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR 
arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 0 + + movdqu xmm0, [rsi + 5] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 8 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_h8_avg_sse2) PRIVATE +sym(aom_filter_block1d4_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm3, 3 + psrldq xmm5, 5 + psrldq xmm4, 4 + + APPLY_FILTER_4 1 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop 
rbp + ret + +global sym(aom_filter_block1d8_h8_avg_sse2) PRIVATE +sym(aom_filter_block1d8_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 1, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_h8_avg_sse2) PRIVATE +sym(aom_filter_block1d16_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa 
xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 1, 0 + + movdqu xmm0, [rsi + 5] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 1, 8 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm new file mode 100644 index 000000000..357f37401 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm @@ -0,0 +1,883 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_64: times 8 dw 64 +even_byte_mask: times 8 dw 0x00ff + +; %define USE_PMULHRSW +; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss +; when using this instruction. +; +; The add order below (based on ffav1) must be followed to prevent outranges. 
+; x = k0k1 + k4k5 +; y = k2k3 + k6k7 +; z = signed SAT(x + y) + +SECTION .text +%define LOCAL_VARS_SIZE 16*6 + +%macro SETUP_LOCAL_VARS 0 + ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 + + ; pmaddubsw has a higher latency on some platforms, this might be eased by + ; interleaving the instructions. + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + packsswb m4, m4 + ; TODO(slavarnway): multiple pshufb instructions had a higher latency on + ; some platforms. + pshuflw m0, m4, 0b ;k0_k1 + pshuflw m1, m4, 01010101b ;k2_k3 + pshuflw m2, m4, 10101010b ;k4_k5 + pshuflw m3, m4, 11111111b ;k6_k7 + punpcklqdq m0, m0 + punpcklqdq m1, m1 + punpcklqdq m2, m2 + punpcklqdq m3, m3 + mova k0k1, m0 + mova k2k3, m1 + mova k4k5, m2 + mova k6k7, m3 +%if ARCH_X86_64 + %define krd m12 + %define tmp0 [rsp + 16*4] + %define tmp1 [rsp + 16*5] + mova krd, [GLOBAL(pw_64)] +%else + %define krd [rsp + 16*4] +%if CONFIG_PIC=0 + mova m6, [GLOBAL(pw_64)] +%else + ; build constants without accessing global memory + pcmpeqb m6, m6 ;all ones + psrlw m6, 15 + psllw m6, 6 ;aka pw_64 +%endif + mova krd, m6 +%endif +%endm + +;------------------------------------------------------------------------------- +%if ARCH_X86_64 + %define LOCAL_VARS_SIZE_H4 0 +%else + %define LOCAL_VARS_SIZE_H4 16*4 +%endif + +%macro SUBPIX_HFILTER4 1 +cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + packsswb m4, m4 +%if ARCH_X86_64 + %define k0k1k4k5 m8 + %define k2k3k6k7 m9 + %define krd m10 + mova krd, [GLOBAL(pw_64)] + pshuflw k0k1k4k5, m4, 0b ;k0_k1 + pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 + pshuflw k2k3k6k7, m4, 01010101b ;k2_k3 + pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 +%else + %define k0k1k4k5 [rsp + 16*0] + %define k2k3k6k7 [rsp + 16*1] + %define krd [rsp + 16*2] + pshuflw m6, m4, 0b ;k0_k1 + pshufhw m6, m6, 10101010b ;k0_k1_k4_k5 + 
pshuflw m7, m4, 01010101b ;k2_k3 + pshufhw m7, m7, 11111111b ;k2_k3_k6_k7 +%if CONFIG_PIC=0 + mova m1, [GLOBAL(pw_64)] +%else + ; build constants without accessing global memory + pcmpeqb m1, m1 ;all ones + psrlw m1, 15 + psllw m1, 6 ;aka pw_64 +%endif + mova k0k1k4k5, m6 + mova k2k3k6k7, m7 + mova krd, m1 +%endif + dec heightd ;pre-decrement so that 0 after the loop's "sub heightd, 2" means one odd row is left + +.loop: + ;Do two rows at once + movu m4, [srcq - 3] + movu m5, [srcq + sstrideq - 3] + punpckhbw m1, m4, m4 + punpcklbw m4, m4 + punpckhbw m3, m5, m5 + punpcklbw m5, m5 + palignr m0, m1, m4, 1 + pmaddubsw m0, k0k1k4k5 + palignr m1, m4, 5 + pmaddubsw m1, k2k3k6k7 + palignr m2, m3, m5, 1 + pmaddubsw m2, k0k1k4k5 + palignr m3, m5, 5 + pmaddubsw m3, k2k3k6k7 + punpckhqdq m4, m0, m2 + punpcklqdq m0, m2 + punpckhqdq m5, m1, m3 + punpcklqdq m1, m3 + paddsw m0, m4 + paddsw m1, m5 +%ifidn %1, h8_avg + movd m4, [dstq] + movd m5, [dstq + dstrideq] +%endif + paddsw m0, m1 + paddsw m0, krd + psraw m0, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + movu m5, [srcq + sstrideq] + punpckldq m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 1 + punpcklbw m4, m3 + paddsw m0, m4 +%endif + packuswb m0, m0 + psrldq m1, m0, 4 + +%ifidn %1, h8_avg + pavgb m0, m4 + pavgb m1, m5 +%endif + movd [dstq], m0 + movd [dstq + dstrideq], m1 + + lea srcq, [srcq + sstrideq ] + prefetcht0 [srcq + 4 * sstrideq - 3] + lea srcq, [srcq + sstrideq ] + lea dstq, [dstq + 2 * dstrideq ] + prefetcht0 [srcq + 2 * sstrideq - 3] + + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m4, [srcq - 3] + punpckhbw m1, m4, m4 + punpcklbw m4, m4 + palignr m0, m1, m4, 1 + palignr m1, m4, 5 + pmaddubsw m0, k0k1k4k5 + pmaddubsw m1, k2k3k6k7 + psrldq m2, m0, 8 + psrldq m3, m1, 8 + paddsw m0, m2 + paddsw m1, m3 + paddsw m0, m1 + paddsw m0, krd + psraw m0, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + punpcklbw m4, m3 + paddsw m0, m4 +%endif + packuswb m0, m0 +%ifidn %1, h8_avg + movd m4, [dstq] + pavgb m0, m4 +%endif + movd
[dstq], m0 +.done: + REP_RET +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER8 1 +cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + dec heightd + +.loop: + ;Do two rows at once + movu m0, [srcq - 3] + movu m4, [srcq + sstrideq - 3] + punpckhbw m1, m0, m0 + punpcklbw m0, m0 + palignr m5, m1, m0, 13 + pmaddubsw m5, k6k7 + palignr m2, m1, m0, 5 + palignr m3, m1, m0, 9 + palignr m1, m0, 1 + pmaddubsw m1, k0k1 + punpckhbw m6, m4, m4 + punpcklbw m4, m4 + pmaddubsw m2, k2k3 + pmaddubsw m3, k4k5 + + palignr m7, m6, m4, 13 + palignr m0, m6, m4, 5 + pmaddubsw m7, k6k7 + paddsw m1, m3 + paddsw m2, m5 + paddsw m1, m2 +%ifidn %1, h8_avg + movh m2, [dstq] + movhps m2, [dstq + dstrideq] +%endif + palignr m5, m6, m4, 9 + palignr m6, m4, 1 + pmaddubsw m0, k2k3 + pmaddubsw m6, k0k1 + paddsw m1, krd + pmaddubsw m5, k4k5 + psraw m1, 7 + paddsw m0, m7 + paddsw m6, m5 + paddsw m6, m0 + paddsw m6, krd + psraw m6, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + movu m5, [srcq + sstrideq] + punpcklbw m4, m3 + punpcklbw m5, m3 + paddsw m1, m4 + paddsw m6, m5 +%endif + packuswb m1, m6 +%ifidn %1, h8_avg + pavgb m1, m2 +%endif + movh [dstq], m1 + movhps [dstq + dstrideq], m1 + + lea srcq, [srcq + sstrideq ] + prefetcht0 [srcq + 4 * sstrideq - 3] + lea srcq, [srcq + sstrideq ] + lea dstq, [dstq + 2 * dstrideq ] + prefetcht0 [srcq + 2 * sstrideq - 3] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m0, [srcq - 3] + punpckhbw m3, m0, m0 + punpcklbw m0, m0 + palignr m1, m3, m0, 1 + palignr m2, m3, m0, 5 + palignr m4, m3, m0, 13 + palignr m3, m0, 9 + pmaddubsw m1, k0k1 + pmaddubsw m2, k2k3 + pmaddubsw m3, k4k5 + pmaddubsw m4, k6k7 + paddsw m1, m3 + paddsw m4, m2 + paddsw m1, m4 + paddsw m1, krd + psraw m1, 7 +%ifidn %1, h8_add_src + pxor m6, m6 + movu m5, [srcq] + punpcklbw m5, m6 + paddsw 
m1, m5 +%endif + packuswb m1, m1 +%ifidn %1, h8_avg + movh m0, [dstq] + pavgb m1, m0 +%endif + movh [dstq], m1 +.done: + REP_RET +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER16 1 +cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +.loop: + prefetcht0 [srcq + 2 * sstrideq -3] + + movu m0, [srcq - 3] + movu m4, [srcq - 2] + pmaddubsw m0, k0k1 + pmaddubsw m4, k0k1 + movu m1, [srcq - 1] + movu m5, [srcq + 0] + pmaddubsw m1, k2k3 + pmaddubsw m5, k2k3 + movu m2, [srcq + 1] + movu m6, [srcq + 2] + pmaddubsw m2, k4k5 + pmaddubsw m6, k4k5 + movu m3, [srcq + 3] + movu m7, [srcq + 4] + pmaddubsw m3, k6k7 + pmaddubsw m7, k6k7 + paddsw m0, m2 + paddsw m1, m3 + paddsw m0, m1 + paddsw m4, m6 + paddsw m5, m7 + paddsw m4, m5 + paddsw m0, krd + paddsw m4, krd + psraw m0, 7 + psraw m4, 7 +%ifidn %1, h8_add_src + movu m5, [srcq] + mova m7, m5 + pand m5, [even_byte_mask] + psrlw m7, 8 + paddsw m0, m5 + paddsw m4, m7 +%endif + packuswb m0, m0 + packuswb m4, m4 + punpcklbw m0, m4 +%ifidn %1, h8_avg + pavgb m0, [dstq] +%endif + lea srcq, [srcq + sstrideq] + mova [dstq], m0 + lea dstq, [dstq + dstrideq] + dec heightd + jnz .loop + REP_RET +%endm + +INIT_XMM ssse3 +SUBPIX_HFILTER16 h8 +SUBPIX_HFILTER16 h8_avg +SUBPIX_HFILTER8 h8 +SUBPIX_HFILTER8 h8_avg +SUBPIX_HFILTER4 h8 +SUBPIX_HFILTER4 h8_avg + +%if CONFIG_LOOP_RESTORATION +SUBPIX_HFILTER16 h8_add_src +SUBPIX_HFILTER8 h8_add_src +SUBPIX_HFILTER4 h8_add_src +%endif + +;------------------------------------------------------------------------------- + +; TODO(Linfeng): Detect cpu type and choose the code with better performance. 
+%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1 + +%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + %define NUM_GENERAL_REG_USED 9 +%else + %define NUM_GENERAL_REG_USED 6 +%endif + +%macro SUBPIX_VFILTER 2 +cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +%ifidn %2, 8 + %define movx movh +%else + %define movx movd +%endif + + dec heightd + +%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + +%if ARCH_X86_64 + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq +%else + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride dstridemp +%endif + mov src1q, srcq + add src1q, sstrideq + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, sstrideq ;pitch * 6 + +.loop: + ;Do two rows at once + movx m0, [srcq ] ;A + movx m1, [src1q ] ;B + punpcklbw m0, m1 ;A B + movx m2, [srcq + sstrideq * 2 ] ;C + pmaddubsw m0, k0k1 + mova m6, m2 + movx m3, [src1q + sstrideq * 2] ;D + punpcklbw m2, m3 ;C D + pmaddubsw m2, k2k3 + movx m4, [srcq + sstrideq * 4 ] ;E + mova m7, m4 + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m4, k4k5 + punpcklbw m1, m6 ;A B next iter + movx m6, [srcq + sstride6q ] ;G + punpcklbw m5, m6 ;E F next iter + punpcklbw m3, m7 ;C D next iter + pmaddubsw m5, k4k5 + movx m7, [src1q + sstride6q ] ;H + punpcklbw m6, m7 ;G H + pmaddubsw m6, k6k7 + pmaddubsw m3, k2k3 + pmaddubsw m1, k0k1 + paddsw m0, m4 + paddsw m2, m6 + movx m6, [srcq + sstrideq * 8 ] ;H next iter + punpcklbw m7, m6 + pmaddubsw m7, k6k7 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + paddsw m1, m5 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [srcq] + punpcklbw m4, m6 + paddsw m0, m4 +%endif + packuswb m0, m0 + + paddsw m3, m7 + paddsw m1, m3 + paddsw m1, krd + psraw m1, 7 +%ifidn %1, v8_add_src + movu m4, [src1q] + punpcklbw m4, m6 + paddsw m1, m4 +%endif + lea srcq, [srcq + sstrideq * 2 ] + lea 
src1q, [src1q + sstrideq * 2] + packuswb m1, m1 + +%ifidn %1, v8_avg + movx m2, [dstq] + pavgb m0, m2 +%endif + movx [dstq], m0 + add dstq, dst_stride +%ifidn %1, v8_avg + movx m3, [dstq] + pavgb m1, m3 +%endif + movx [dstq], m1 + add dstq, dst_stride + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + movx m6, [srcq + sstride6q ] ;G + punpcklbw m0, m1 ;A B + movx m7, [src1q + sstride6q ] ;H + pmaddubsw m0, k0k1 + movx m2, [srcq + sstrideq * 2 ] ;C + punpcklbw m6, m7 ;G H + movx m3, [src1q + sstrideq * 2] ;D + pmaddubsw m6, k6k7 + movx m4, [srcq + sstrideq * 4 ] ;E + punpcklbw m2, m3 ;C D + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m2, k2k3 + pmaddubsw m4, k4k5 + paddsw m2, m6 + paddsw m0, m4 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [srcq] + punpcklbw m4, m6 + paddsw m0, m4 +%endif + packuswb m0, m0 +%ifidn %1, v8_avg + movx m1, [dstq] + pavgb m0, m1 +%endif + movx [dstq], m0 + +%else + ; ARCH_X86_64 + + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + lea srcq, [srcq + sstrideq * 2 ] + movx m2, [srcq] ;C + movx m3, [srcq + sstrideq] ;D + lea srcq, [srcq + sstrideq * 2 ] + movx m4, [srcq] ;E + movx m5, [srcq + sstrideq] ;F + lea srcq, [srcq + sstrideq * 2 ] + movx m6, [srcq] ;G + punpcklbw m0, m1 ;A B + punpcklbw m1, m2 ;A B next iter + punpcklbw m2, m3 ;C D + punpcklbw m3, m4 ;C D next iter + punpcklbw m4, m5 ;E F + punpcklbw m5, m6 ;E F next iter + +.loop: + ;Do two rows at once + movx m7, [srcq + sstrideq] ;H + lea srcq, [srcq + sstrideq * 2 ] + movx m14, [srcq] ;H next iter + punpcklbw m6, m7 ;G H + punpcklbw m7, m14 ;G H next iter + pmaddubsw m8, m0, k0k1 + pmaddubsw m9, m1, k0k1 + mova m0, m2 + mova m1, m3 + pmaddubsw m10, m2, k2k3 + pmaddubsw m11, m3, k2k3 + mova m2, m4 + mova m3, m5 + pmaddubsw m4, k4k5 + pmaddubsw m5, k4k5 + paddsw m8, m4 + paddsw m9, m5 + mova m4, m6 + mova m5, m7 
+ pmaddubsw m6, k6k7 + pmaddubsw m7, k6k7 + paddsw m10, m6 + paddsw m11, m7 + paddsw m8, m10 + paddsw m9, m11 + mova m6, m14 + paddsw m8, krd + paddsw m9, krd + psraw m8, 7 + psraw m9, 7 +%ifidn %2, 4 + packuswb m8, m8 + packuswb m9, m9 +%else + packuswb m8, m9 +%endif + +%ifidn %1, v8_avg + movx m7, [dstq] +%ifidn %2, 4 + movx m10, [dstq + dstrideq] + pavgb m9, m10 +%else + movhpd m7, [dstq + dstrideq] +%endif + pavgb m8, m7 +%endif + movx [dstq], m8 +%ifidn %2, 4 + movx [dstq + dstrideq], m9 +%else + movhpd [dstq + dstrideq], m8 +%endif + + lea dstq, [dstq + dstrideq * 2 ] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movx m7, [srcq + sstrideq] ;H + punpcklbw m6, m7 ;G H + pmaddubsw m0, k0k1 + pmaddubsw m2, k2k3 + pmaddubsw m4, k4k5 + pmaddubsw m6, k6k7 + paddsw m0, m4 + paddsw m2, m6 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + packuswb m0, m0 +%ifidn %1, v8_avg + movx m1, [dstq] + pavgb m0, m1 +%endif + movx [dstq], m0 + +%endif ; ARCH_X86_64 + +.done: + REP_RET + +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_VFILTER16 1 +cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + +%if ARCH_X86_64 + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq +%else + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride dstridemp +%endif + lea src1q, [srcq + sstrideq] + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, sstrideq ;pitch * 6 + +.loop: + movh m0, [srcq ] ;A + movh m1, [src1q ] ;B + movh m2, [srcq + sstrideq * 2 ] ;C + movh m3, [src1q + sstrideq * 2] ;D + movh m4, [srcq + sstrideq * 4 ] ;E + movh m5, [src1q + sstrideq * 4] ;F + + punpcklbw m0, m1 ;A B + movh m6, [srcq + sstride6q] ;G + punpcklbw m2, m3 ;C D + movh m7, [src1q + sstride6q] ;H + punpcklbw 
m4, m5 ;E F + pmaddubsw m0, k0k1 + movh m3, [srcq + 8] ;A + pmaddubsw m2, k2k3 + punpcklbw m6, m7 ;G H + movh m5, [srcq + sstrideq + 8] ;B + pmaddubsw m4, k4k5 + punpcklbw m3, m5 ;A B + movh m7, [srcq + sstrideq * 2 + 8] ;C + pmaddubsw m6, k6k7 + movh m5, [src1q + sstrideq * 2 + 8] ;D + punpcklbw m7, m5 ;C D + paddsw m2, m6 + pmaddubsw m3, k0k1 + movh m1, [srcq + sstrideq * 4 + 8] ;E + paddsw m0, m4 + pmaddubsw m7, k2k3 + movh m6, [src1q + sstrideq * 4 + 8] ;F + punpcklbw m1, m6 ;E F + paddsw m0, m2 + paddsw m0, krd + movh m2, [srcq + sstride6q + 8] ;G + pmaddubsw m1, k4k5 + movh m5, [src1q + sstride6q + 8] ;H + psraw m0, 7 + punpcklbw m2, m5 ;G H + pmaddubsw m2, k6k7 + paddsw m7, m2 + paddsw m3, m1 + paddsw m3, m7 + paddsw m3, krd + psraw m3, 7 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down + mova m5, m4 + punpcklbw m4, m6 + punpckhbw m5, m6 + paddsw m0, m4 + paddsw m3, m5 +%endif + packuswb m0, m3 + + add srcq, sstrideq + add src1q, sstrideq +%ifidn %1, v8_avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dst_stride + dec heightd + jnz .loop + REP_RET + +%else + ; ARCH_X86_64 + dec heightd + + movu m1, [srcq ] ;A + movu m3, [srcq + sstrideq ] ;B + lea srcq, [srcq + sstrideq * 2] + punpcklbw m0, m1, m3 ;A B + punpckhbw m1, m3 ;A B + movu m5, [srcq] ;C + punpcklbw m2, m3, m5 ;A B next iter + punpckhbw m3, m5 ;A B next iter + mova tmp0, m2 ;store to stack + mova tmp1, m3 ;store to stack + movu m7, [srcq + sstrideq] ;D + lea srcq, [srcq + sstrideq * 2] + punpcklbw m4, m5, m7 ;C D + punpckhbw m5, m7 ;C D + movu m9, [srcq] ;E + punpcklbw m6, m7, m9 ;C D next iter + punpckhbw m7, m9 ;C D next iter + movu m11, [srcq + sstrideq] ;F + lea srcq, [srcq + sstrideq * 2] + punpcklbw m8, m9, m11 ;E F + punpckhbw m9, m11 ;E F + movu m2, [srcq] ;G + punpcklbw m10, m11, m2 ;E F next iter + punpckhbw m11, m2 ;E F next iter + +.loop: + ;Do two rows at once + pmaddubsw m13, m0, k0k1 + mova m0, m4 + pmaddubsw m14, m8, k4k5 + 
pmaddubsw m15, m4, k2k3 + mova m4, m8 + paddsw m13, m14 + movu m3, [srcq + sstrideq] ;H + lea srcq, [srcq + sstrideq * 2] + punpcklbw m14, m2, m3 ;G H + mova m8, m14 + pmaddubsw m14, k6k7 + paddsw m15, m14 + paddsw m13, m15 + paddsw m13, krd + psraw m13, 7 + + pmaddubsw m14, m1, k0k1 + pmaddubsw m1, m9, k4k5 + pmaddubsw m15, m5, k2k3 + paddsw m14, m1 + mova m1, m5 + mova m5, m9 + punpckhbw m2, m3 ;G H + mova m9, m2 + pmaddubsw m2, k6k7 + paddsw m15, m2 + paddsw m14, m15 + paddsw m14, krd + psraw m14, 7 + packuswb m13, m14 +%ifidn %1, v8_avg + pavgb m13, [dstq] +%endif + mova [dstq], m13 + + ; next iter + pmaddubsw m15, tmp0, k0k1 + pmaddubsw m14, m10, k4k5 + pmaddubsw m13, m6, k2k3 + paddsw m15, m14 + mova tmp0, m6 + mova m6, m10 + movu m2, [srcq] ;G next iter + punpcklbw m14, m3, m2 ;G H next iter + mova m10, m14 + pmaddubsw m14, k6k7 + paddsw m13, m14 + paddsw m15, m13 + paddsw m15, krd + psraw m15, 7 + + pmaddubsw m14, tmp1, k0k1 + mova tmp1, m7 + pmaddubsw m13, m7, k2k3 + mova m7, m11 + pmaddubsw m11, k4k5 + paddsw m14, m11 + punpckhbw m3, m2 ;G H next iter + mova m11, m3 + pmaddubsw m3, k6k7 + paddsw m13, m3 + paddsw m14, m13 + paddsw m14, krd + psraw m14, 7 + packuswb m15, m14 +%ifidn %1, v8_avg + pavgb m15, [dstq + dstrideq] +%endif + mova [dstq + dstrideq], m15 + lea dstq, [dstq + dstrideq * 2] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m3, [srcq + sstrideq] ;H + punpcklbw m6, m2, m3 ;G H + punpckhbw m2, m3 ;G H + pmaddubsw m0, k0k1 + pmaddubsw m1, k0k1 + pmaddubsw m4, k2k3 + pmaddubsw m5, k2k3 + pmaddubsw m8, k4k5 + pmaddubsw m9, k4k5 + pmaddubsw m6, k6k7 + pmaddubsw m2, k6k7 + paddsw m0, m8 + paddsw m1, m9 + paddsw m4, m6 + paddsw m5, m2 + paddsw m0, m4 + paddsw m1, m5 + paddsw m0, krd + paddsw m1, krd + psraw m0, 7 + psraw m1, 7 + packuswb m0, m1 +%ifidn %1, v8_avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + +.done: + REP_RET + +%endif ; ARCH_X86_64 + +%endm + +INIT_XMM ssse3 +SUBPIX_VFILTER16 v8 
+SUBPIX_VFILTER16 v8_avg +SUBPIX_VFILTER v8, 8 +SUBPIX_VFILTER v8_avg, 8 +SUBPIX_VFILTER v8, 4 +SUBPIX_VFILTER v8_avg, 4 + +%if (ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON) && \ + CONFIG_LOOP_RESTORATION +SUBPIX_VFILTER16 v8_add_src +SUBPIX_VFILTER v8_add_src, 8 +SUBPIX_VFILTER v8_add_src, 4 +%endif diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm new file mode 100644 index 000000000..8f025a8be --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm @@ -0,0 +1,451 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 ;64 in each of the two low 16-bit words + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklqdq xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 ;broadcast dword 0, so every word holds 64 + + pxor xmm2, xmm2 ;zero, used to unpack bytes to words + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + + punpckldq xmm0, xmm1 ;two rows in one register + punpcklbw xmm0, xmm2 ;unpack bytes to words + pmullw xmm0, xmm4 ;multiply the filter factors + + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 ;low 4 words: k3 products + k4 products + + paddsw xmm0, xmm3 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 ;average with the bytes already at [rdi] +%endif + + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx ;leaves ZF for the jnz that follows the macro +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 ;64 in each of the two low 16-bit words + + movdqa xmm7, [rdx] ;load filters + + pshuflw xmm6, xmm7, 11111111b ;k3 + pshufhw xmm7, xmm7, 0b ;k4 + punpcklwd xmm6, xmm6 ;k3 in all 8 words + punpckhwd xmm7, xmm7 ;k4 in all 8 words + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 ;broadcast dword 0, so every word holds 64 + + pxor xmm5, xmm5 ;zero, used to unpack bytes to words + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + paddsw xmm0, xmm1 + paddsw xmm0, xmm4 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 ;average with the bytes already at [rdi] +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx ;leaves ZF for the jnz that follows the macro +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + punpckhbw
xmm3, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + pmullw xmm2, xmm6 + pmullw xmm3, xmm7 + + paddsw xmm0, xmm1 + paddsw xmm2, xmm3 + + paddsw xmm0, xmm4 ;rounding + paddsw xmm2, xmm4 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +global sym(aom_filter_block1d4_v2_sse2) PRIVATE +sym(aom_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_v2_sse2) PRIVATE +sym(aom_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_v2_sse2) PRIVATE +sym(aom_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_v2_avg_sse2) PRIVATE +sym(aom_filter_block1d4_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_v2_avg_sse2) 
PRIVATE +sym(aom_filter_block1d8_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_v2_avg_sse2) PRIVATE +sym(aom_filter_block1d16_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_h2_sse2) PRIVATE +sym(aom_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_h2_sse2) PRIVATE +sym(aom_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_h2_sse2) PRIVATE +sym(aom_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global 
sym(aom_filter_block1d4_h2_avg_sse2) PRIVATE +sym(aom_filter_block1d4_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_h2_avg_sse2) PRIVATE +sym(aom_filter_block1d8_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_h2_avg_sse2) PRIVATE +sym(aom_filter_block1d16_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm new file mode 100644 index 000000000..b9b2da0be --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm @@ -0,0 +1,421 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. 
If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov ecx, 0x01000100 + + movdqa xmm3, [rdx] ;load filters + psrldq xmm3, 6 + packsswb xmm3, xmm3 + pshuflw xmm3, xmm3, 0b ;k3_k4 + + movd xmm2, ecx ;rounding_shift + pshufd xmm2, xmm2, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm3 + + pmulhrsw xmm0, xmm2 ;rounding(+64)+shift(>>7) + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov ecx, 0x01000100 + + movdqa xmm7, [rdx] ;load filters + psrldq xmm7, 6 + packsswb xmm7, xmm7 + pshuflw xmm7, xmm7, 0b ;k3_k4 + punpcklwd xmm7, xmm7 + + movd xmm6, ecx ;rounding_shift + pshufd xmm6, xmm6, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm7 + + pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) + packuswb xmm0, xmm0 ;pack back to byte + +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm1 + punpckhbw xmm2, xmm1 + pmaddubsw xmm0, xmm7 + pmaddubsw xmm2, xmm7 + + pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) + pmulhrsw xmm2, xmm6 + packuswb xmm0, xmm2 ;pack back to byte + +%if %1 + 
movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +global sym(aom_filter_block1d4_v2_ssse3) PRIVATE +sym(aom_filter_block1d4_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_v2_ssse3) PRIVATE +sym(aom_filter_block1d8_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_v2_ssse3) PRIVATE +sym(aom_filter_block1d16_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_v2_avg_ssse3) PRIVATE +sym(aom_filter_block1d4_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_v2_avg_ssse3) PRIVATE +sym(aom_filter_block1d8_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi 
+ RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_v2_avg_ssse3) PRIVATE +sym(aom_filter_block1d16_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_h2_ssse3) PRIVATE +sym(aom_filter_block1d4_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_h2_ssse3) PRIVATE +sym(aom_filter_block1d8_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_h2_ssse3) PRIVATE +sym(aom_filter_block1d16_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_h2_avg_ssse3) PRIVATE +sym(aom_filter_block1d4_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + 
UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_h2_avg_ssse3) PRIVATE +sym(aom_filter_block1d8_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_h2_avg_ssse3) PRIVATE +sym(aom_filter_block1d16_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c new file mode 100644 index 000000000..bcdc20f63 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c @@ -0,0 +1,426 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "aom_dsp/x86/synonyms.h" + +#include "./aom_dsp_rtcd.h" +#include "aom_ports/mem.h" + +void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, + int *min, int *max) { + __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff; + u0 = _mm_setzero_si128(); + // Row 0 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff0 = _mm_max_epi16(diff, negdiff); + // Row 1 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(absdiff0, absdiff); + minabsdiff = _mm_min_epi16(absdiff0, absdiff); + // Row 2 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 3 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 4 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + 
maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 5 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 6 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 7 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8)); + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32)); + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16)); + *max = _mm_extract_epi16(maxabsdiff, 0); + + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8)); + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32)); + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16)); + *min = _mm_extract_epi16(minabsdiff, 0); +} + +unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) { + __m128i s0, s1, u0; + unsigned int avg = 0; + u0 = _mm_setzero_si128(); + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + s1 = 
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); + avg = _mm_extract_epi16(s0, 0); + return (avg + 32) >> 6; +} + +unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) { + __m128i s0, s1, u0; + unsigned int avg = 0; + + u0 = _mm_setzero_si128(); + s0 = _mm_unpacklo_epi8(xx_loadl_32(s), u0); + s1 = _mm_unpacklo_epi8(xx_loadl_32(s + p), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(xx_loadl_32(s + 2 * p), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(xx_loadl_32(s + 3 * p), u0); + s0 = _mm_adds_epu16(s0, s1); + + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); + avg = _mm_extract_epi16(s0, 0); + return (avg + 8) >> 4; +} + +static void hadamard_col8_sse2(__m128i *in, int iter) { + __m128i a0 = in[0]; + __m128i a1 = in[1]; + __m128i a2 = in[2]; + __m128i a3 = in[3]; + __m128i a4 = in[4]; + __m128i a5 = in[5]; + __m128i a6 = in[6]; + __m128i a7 = in[7]; + + __m128i b0 = _mm_add_epi16(a0, a1); + __m128i b1 = _mm_sub_epi16(a0, a1); + __m128i b2 = _mm_add_epi16(a2, a3); + __m128i b3 = _mm_sub_epi16(a2, a3); + __m128i b4 = _mm_add_epi16(a4, a5); + __m128i b5 
= _mm_sub_epi16(a4, a5); + __m128i b6 = _mm_add_epi16(a6, a7); + __m128i b7 = _mm_sub_epi16(a6, a7); + + a0 = _mm_add_epi16(b0, b2); + a1 = _mm_add_epi16(b1, b3); + a2 = _mm_sub_epi16(b0, b2); + a3 = _mm_sub_epi16(b1, b3); + a4 = _mm_add_epi16(b4, b6); + a5 = _mm_add_epi16(b5, b7); + a6 = _mm_sub_epi16(b4, b6); + a7 = _mm_sub_epi16(b5, b7); + + if (iter == 0) { + b0 = _mm_add_epi16(a0, a4); + b7 = _mm_add_epi16(a1, a5); + b3 = _mm_add_epi16(a2, a6); + b4 = _mm_add_epi16(a3, a7); + b2 = _mm_sub_epi16(a0, a4); + b6 = _mm_sub_epi16(a1, a5); + b1 = _mm_sub_epi16(a2, a6); + b5 = _mm_sub_epi16(a3, a7); + + a0 = _mm_unpacklo_epi16(b0, b1); + a1 = _mm_unpacklo_epi16(b2, b3); + a2 = _mm_unpackhi_epi16(b0, b1); + a3 = _mm_unpackhi_epi16(b2, b3); + a4 = _mm_unpacklo_epi16(b4, b5); + a5 = _mm_unpacklo_epi16(b6, b7); + a6 = _mm_unpackhi_epi16(b4, b5); + a7 = _mm_unpackhi_epi16(b6, b7); + + b0 = _mm_unpacklo_epi32(a0, a1); + b1 = _mm_unpacklo_epi32(a4, a5); + b2 = _mm_unpackhi_epi32(a0, a1); + b3 = _mm_unpackhi_epi32(a4, a5); + b4 = _mm_unpacklo_epi32(a2, a3); + b5 = _mm_unpacklo_epi32(a6, a7); + b6 = _mm_unpackhi_epi32(a2, a3); + b7 = _mm_unpackhi_epi32(a6, a7); + + in[0] = _mm_unpacklo_epi64(b0, b1); + in[1] = _mm_unpackhi_epi64(b0, b1); + in[2] = _mm_unpacklo_epi64(b2, b3); + in[3] = _mm_unpackhi_epi64(b2, b3); + in[4] = _mm_unpacklo_epi64(b4, b5); + in[5] = _mm_unpackhi_epi64(b4, b5); + in[6] = _mm_unpacklo_epi64(b6, b7); + in[7] = _mm_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm_add_epi16(a0, a4); + in[7] = _mm_add_epi16(a1, a5); + in[3] = _mm_add_epi16(a2, a6); + in[4] = _mm_add_epi16(a3, a7); + in[2] = _mm_sub_epi16(a0, a4); + in[6] = _mm_sub_epi16(a1, a5); + in[1] = _mm_sub_epi16(a2, a6); + in[5] = _mm_sub_epi16(a3, a7); + } +} + +void aom_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, + int16_t *coeff) { + __m128i src[8]; + src[0] = _mm_load_si128((const __m128i *)src_diff); + src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[2] = 
_mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + + hadamard_col8_sse2(src, 0); + hadamard_col8_sse2(src, 1); + + _mm_store_si128((__m128i *)coeff, src[0]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[1]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[2]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[3]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[4]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[5]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[6]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[7]); +} + +void aom_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, + int16_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + int16_t const *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + aom_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64); + } + + for (idx = 0; idx < 64; idx += 8) { + __m128i coeff0 = _mm_load_si128((const __m128i *)coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192)); + + __m128i b0 = _mm_add_epi16(coeff0, coeff1); + __m128i b1 = _mm_sub_epi16(coeff0, coeff1); + __m128i b2 = _mm_add_epi16(coeff2, coeff3); + __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + + b0 = _mm_srai_epi16(b0, 1); + b1 = _mm_srai_epi16(b1, 1); + b2 = _mm_srai_epi16(b2, 1); + b3 = _mm_srai_epi16(b3, 1); + + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + _mm_store_si128((__m128i *)coeff, coeff0); + _mm_store_si128((__m128i *)(coeff + 64), 
coeff1); + + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + _mm_store_si128((__m128i *)(coeff + 128), coeff2); + _mm_store_si128((__m128i *)(coeff + 192), coeff3); + + coeff += 8; + } +} + +int aom_satd_sse2(const int16_t *coeff, int length) { + int i; + const __m128i zero = _mm_setzero_si128(); + __m128i accum = zero; + + for (i = 0; i < length; i += 8) { + const __m128i src_line = _mm_load_si128((const __m128i *)coeff); + const __m128i inv = _mm_sub_epi16(zero, src_line); + const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line) + const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero); + const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero); + const __m128i sum = _mm_add_epi32(abs_lo, abs_hi); + accum = _mm_add_epi32(accum, sum); + coeff += 8; + } + + { // cascading summation of accum + __m128i hi = _mm_srli_si128(accum, 8); + accum = _mm_add_epi32(accum, hi); + hi = _mm_srli_epi64(accum, 32); + accum = _mm_add_epi32(accum, hi); + } + + return _mm_cvtsi128_si32(accum); +} + +void aom_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, int ref_stride, + int height) { + int idx; + __m128i zero = _mm_setzero_si128(); + __m128i src_line = _mm_loadu_si128((const __m128i *)ref); + __m128i s0 = _mm_unpacklo_epi8(src_line, zero); + __m128i s1 = _mm_unpackhi_epi8(src_line, zero); + __m128i t0, t1; + int height_1 = height - 1; + ref += ref_stride; + + for (idx = 1; idx < height_1; idx += 2) { + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + ref += ref_stride; + + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + ref += ref_stride; + } + + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = 
_mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + + if (height == 64) { + s0 = _mm_srai_epi16(s0, 5); + s1 = _mm_srai_epi16(s1, 5); + } else if (height == 32) { + s0 = _mm_srai_epi16(s0, 4); + s1 = _mm_srai_epi16(s1, 4); + } else { + s0 = _mm_srai_epi16(s0, 3); + s1 = _mm_srai_epi16(s1, 3); + } + + _mm_storeu_si128((__m128i *)hbuf, s0); + hbuf += 8; + _mm_storeu_si128((__m128i *)hbuf, s1); +} + +int16_t aom_int_pro_col_sse2(uint8_t const *ref, int width) { + __m128i zero = _mm_setzero_si128(); + __m128i src_line = _mm_load_si128((const __m128i *)ref); + __m128i s0 = _mm_sad_epu8(src_line, zero); + __m128i s1; + int i; + + for (i = 16; i < width; i += 16) { + ref += 16; + src_line = _mm_load_si128((const __m128i *)ref); + s1 = _mm_sad_epu8(src_line, zero); + s0 = _mm_adds_epu16(s0, s1); + } + + s1 = _mm_srli_si128(s0, 8); + s0 = _mm_adds_epu16(s0, s1); + + return _mm_extract_epi16(s0, 0); +} + +int aom_vector_var_sse2(int16_t const *ref, int16_t const *src, int bwl) { + int idx; + int width = 4 << bwl; + int16_t mean; + __m128i v0 = _mm_loadu_si128((const __m128i *)ref); + __m128i v1 = _mm_load_si128((const __m128i *)src); + __m128i diff = _mm_subs_epi16(v0, v1); + __m128i sum = diff; + __m128i sse = _mm_madd_epi16(diff, diff); + + ref += 8; + src += 8; + + for (idx = 8; idx < width; idx += 8) { + v0 = _mm_loadu_si128((const __m128i *)ref); + v1 = _mm_load_si128((const __m128i *)src); + diff = _mm_subs_epi16(v0, v1); + + sum = _mm_add_epi16(sum, diff); + v0 = _mm_madd_epi16(diff, diff); + sse = _mm_add_epi32(sse, v0); + + ref += 8; + src += 8; + } + + v0 = _mm_srli_si128(sum, 8); + sum = _mm_add_epi16(sum, v0); + v0 = _mm_srli_epi64(sum, 32); + sum = _mm_add_epi16(sum, v0); + v0 = _mm_srli_epi32(sum, 16); + sum = _mm_add_epi16(sum, v0); + + v1 = _mm_srli_si128(sse, 8); + sse = _mm_add_epi32(sse, v1); + v1 = _mm_srli_epi64(sse, 32); + sse = _mm_add_epi32(sse, v1); + + mean = _mm_extract_epi16(sum, 0); + + return 
_mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2)); +} diff --git a/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm new file mode 100644 index 000000000..b2d150296 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm @@ -0,0 +1,124 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%define private_prefix aom + +%include "third_party/x86inc/x86inc.asm" + +; This file provides SSSE3 version of the hadamard transformation. Part +; of the macro definitions are originally derived from the ffmpeg project. +; The current version applies to x86 64-bit only. 
+
+SECTION .text
+
+%if ARCH_X86_64
+; matrix transpose
+;
+; INTERLEAVE_2X size, a, b, tmp
+;   Interleaves registers m<a> and m<b> at the granularity named by `size`
+;   (the suffix appended to punpckl/punpckh: wd, dq or qdq). The high-half
+;   interleave is written to m<tmp>, the low-half interleave replaces m<a>,
+;   and SWAP then exchanges the names b and tmp, so afterwards a = low
+;   result, b = high result, and tmp names the register that held the old b.
+%macro INTERLEAVE_2X 4
+  punpckh%1          m%4, m%2, m%3
+  punpckl%1          m%2, m%3
+  SWAP               %3,  %4
+%endmacro
+
+; TRANSPOSE8X8 r0..r7, tmp
+;   Transposes an 8x8 matrix of 16-bit elements held one row per register
+;   (arguments 1-8), using argument 9 as the scratch register. Three rounds of
+;   interleaving (16-bit, 32-bit, 64-bit) are followed by two SWAPs that put
+;   the rows back into natural order.
+%macro TRANSPOSE8X8 9
+  INTERLEAVE_2X  wd, %1, %2, %9
+  INTERLEAVE_2X  wd, %3, %4, %9
+  INTERLEAVE_2X  wd, %5, %6, %9
+  INTERLEAVE_2X  wd, %7, %8, %9
+
+  INTERLEAVE_2X  dq, %1, %3, %9
+  INTERLEAVE_2X  dq, %2, %4, %9
+  INTERLEAVE_2X  dq, %5, %7, %9
+  INTERLEAVE_2X  dq, %6, %8, %9
+
+  INTERLEAVE_2X  qdq, %1, %5, %9
+  INTERLEAVE_2X  qdq, %3, %7, %9
+  INTERLEAVE_2X  qdq, %2, %6, %9
+  INTERLEAVE_2X  qdq, %4, %8, %9
+
+  SWAP  %2, %5
+  SWAP  %4, %7
+%endmacro
+
+; HMD8_1D
+;   One 8-point add/subtract butterfly pass over m0..m7 (eight 16-bit lanes
+;   each), in three stages. In every group the differences are computed into
+;   m8/m9 first, the sums then overwrite the first operands, and SWAP renames
+;   m8/m9 into the slots of the second operands. m8 and m9 are scratch.
+%macro HMD8_1D 0
+  ; stage 1: pairs (0,1) (2,3) (4,5) (6,7)
+  psubw              m8, m0, m1
+  psubw              m9, m2, m3
+  paddw              m0, m1
+  paddw              m2, m3
+  SWAP               1, 8
+  SWAP               3, 9
+  psubw              m8, m4, m5
+  psubw              m9, m6, m7
+  paddw              m4, m5
+  paddw              m6, m7
+  SWAP               5, 8
+  SWAP               7, 9
+
+  ; stage 2: pairs (0,2) (1,3) (4,6) (5,7)
+  psubw              m8, m0, m2
+  psubw              m9, m1, m3
+  paddw              m0, m2
+  paddw              m1, m3
+  SWAP               2, 8
+  SWAP               3, 9
+  psubw              m8, m4, m6
+  psubw              m9, m5, m7
+  paddw              m4, m6
+  paddw              m5, m7
+  SWAP               6, 8
+  SWAP               7, 9
+
+  ; stage 3: pairs (0,4) (1,5) (2,6) (3,7)
+  psubw              m8, m0, m4
+  psubw              m9, m1, m5
+  paddw              m0, m4
+  paddw              m1, m5
+  SWAP               4, 8
+  SWAP               5, 9
+  psubw              m8, m2, m6
+  psubw              m9, m3, m7
+  paddw              m2, m6
+  paddw              m3, m7
+  SWAP               6, 8
+  SWAP               7, 9
+%endmacro
+
+; hadamard_8x8(input, stride, output)
+;   8x8 Hadamard transform: loads eight rows into m0..m7, runs HMD8_1D,
+;   transposes, runs HMD8_1D again, and stores the eight registers to
+;   output + 0 .. output + 112 (16 bytes apart).
+;   cglobal arguments: 3 parameters, 5 general-purpose registers (r3, r4 are
+;   the extra two), 10 vector registers (m0..m9).
+;   Rows are addressed 2*stride bytes apart, i.e. presumably `stride` counts
+;   16-bit elements -- confirm against the C prototype.
+;   NOTE(review): strideq is used without an explicit sign extension; confirm
+;   the prototype/caller guarantees a clean 64-bit value.
+INIT_XMM ssse3
+cglobal hadamard_8x8, 3, 5, 10, input, stride, output
+  lea                r3, [2 * strideq]      ; byte offset of the next row
+  lea                r4, [4 * strideq]      ; byte offset two rows ahead
+
+  mova               m0, [inputq]
+  mova               m1, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m2, [inputq]
+  mova               m3, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m4, [inputq]
+  mova               m5, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m6, [inputq]
+  mova               m7, [inputq + r3]
+
+  HMD8_1D                                   ; first 1-D pass
+  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9    ; m9 is the scratch register
+  HMD8_1D                                   ; second 1-D pass on the other axis
+
+  mova              [outputq +   0], m0
+  mova              [outputq +  16], m1
+  mova              [outputq +  32], m2
+  mova              [outputq +  48], m3
+  mova              [outputq +  64], m4
+  mova              [outputq +  80], m5
+  mova              [outputq +  96], m6
+  mova              [outputq + 112], m7
+
+  RET
+%endif
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
new file mode 100644
index 000000000..e916e4ff9
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_integer.h"
+
+#include "./aom_dsp_rtcd.h"
+
+// To start out, just dispatch to the function using the 2D mask and
+// pass mask stride as 0. This can be improved upon if necessary.
+
+// Blend of src0 and src1 into dst using a horizontal (1-D) mask.
+// Forwards every argument unchanged to aom_blend_a64_mask_sse4_1, inserting a
+// mask stride of 0 after `mask` and two trailing zeros after `w`.
+// Presumably the zero stride makes the callee reuse the same mask row for all
+// `h` rows and the trailing zeros disable mask sub-sampling -- confirm against
+// aom_blend_a64_mask_sse4_1, which is not defined in this file.
+void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
+                                const uint8_t *src0, uint32_t src0_stride,
+                                const uint8_t *src1, uint32_t src1_stride,
+                                const uint8_t *mask, int h, int w) {
+  aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                            src1_stride, mask, 0, h, w, 0, 0);
+}
+
+#if CONFIG_HIGHBITDEPTH
+// High-bit-depth counterpart, compiled only when CONFIG_HIGHBITDEPTH is set.
+// Same forwarding scheme as above (mask stride 0, two trailing zeros), with
+// the bit depth `bd` passed through as the final argument to
+// aom_highbd_blend_a64_mask_sse4_1.
+void aom_highbd_blend_a64_hmask_sse4_1(
+    uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
+    uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w, int bd) {
+  aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride,
+                                   src1_8, src1_stride, mask, 0, h, w, 0, 0,
+                                   bd);
+}
+#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
new file mode 100644
index 000000000..68d74e517
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -0,0 +1,924 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE4.1 + +#include + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/blend_sse4.h" + +#include "./aom_dsp_rtcd.h" + +////////////////////////////////////////////////////////////////////////////// +// No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do 
{ + const __m128i v_m0_b = xx_loadl_64(mask); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_m0l_b = xx_loadl_64(mask + c); + const __m128i v_m0h_b = xx_loadl_64(mask + c + 8); + const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b); + const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b); + const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); + const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); + + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); + + const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_sx_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 
0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_r_b = xx_loadu_128(mask); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { 
+ int c; + for (c = 0; c < w; c += 16) { + const __m128i v_rl_b = xx_loadu_128(mask + 2 * c); + const __m128i v_rh_b = xx_loadu_128(mask + 2 * c + 16); + const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1)); + const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1)); + + const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b); + const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b); + const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); + const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); + + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); + + const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_sy_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void 
blend_a64_mask_sy_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sy_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zero = _mm_setzero_si128(); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_ra_b = xx_loadu_128(mask + c); + const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero); + const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); + const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); + + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); + + const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 
2 * mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal and Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_sx_sy_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_sy_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + const __m128i 
v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_sy_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_ral_b = xx_loadu_128(mask + 2 * c); + const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16); + const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c); + const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16); + const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b); + const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b); + const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b); + const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b); + const __m128i v_rvsbl_w = + _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b); + const __m128i v_rvsbh_w = + _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b); + const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w); + const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w); + + const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2); + const __m128i v_m0h_w = 
xx_roundn_epu16(v_rsh_w, 2); + const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); + const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); + + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); + + const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + int w, int suby, int subx) { + typedef void (*blend_fn)( + uint8_t * dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w); + + // Dimensions are: width_index X subx X suby + static const blend_fn blend[3][2][2] = { + { // w % 16 == 0 + { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 }, + { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 }, + { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } }, + { // w == 8 + { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 }, + { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } } + }; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + 
aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, h, w, suby, subx); + } else { + blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0, + src0_stride, src1, src1_stride, + mask, mask_stride, h, w); + } +} + +#if CONFIG_HIGHBITDEPTH +////////////////////////////////////////////////////////////////////////////// +// No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + (void)w; + blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, blend_4_b10); +} + +static void blend_a64_mask_b12_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + (void)w; + blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, 
const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_m0_b = xx_loadl_64(mask + c); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b10); +} + +static void blend_a64_mask_b12_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w 
= _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + (void)w; + blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_a64_mask_b12_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + (void)w; + blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_r_b = xx_loadu_128(mask + 2 * c); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + 
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b10); +} + +static void blend_a64_mask_b12_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void 
blend_a64_mask_b10_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + (void)w; + blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_a64_mask_b12_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + (void)w; + blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_ra_b = xx_loadl_64(mask + c); + const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + 
blend_8_b10); +} + +static void blend_a64_mask_b12_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal and Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + (void)w; + 
blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_a64_mask_b12_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + (void)w; + blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_ra_b = xx_loadu_128(mask + 2 * c); + const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + 
blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b10); +} + +static void blend_a64_mask_b12_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, + uint32_t src0_stride, + const uint8_t *src1_8, + uint32_t src1_stride, const uint8_t *mask, + uint32_t mask_stride, int h, int w, + int suby, int subx, int bd) { + typedef void (*blend_fn)( + uint16_t * dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w); + + // Dimensions are: bd_index X width_index X subx X suby + static const blend_fn blend[2][2][2][2] = { + { // bd == 8 or 10 + { // w % 8 == 0 + { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 }, + { blend_a64_mask_b10_sx_w8n_sse4_1, + blend_a64_mask_b10_sx_sy_w8n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 }, + { blend_a64_mask_b10_sx_w4_sse4_1, + blend_a64_mask_b10_sx_sy_w4_sse4_1 } } }, + { // bd == 12 + { // w % 8 == 0 + { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 }, + { blend_a64_mask_b12_sx_w8n_sse4_1, + blend_a64_mask_b12_sx_sy_w8n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 }, + { blend_a64_mask_b12_sx_w4_sse4_1, + blend_a64_mask_b12_sx_sy_w4_sse4_1 } } } + }; + + 
assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); + assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, mask_stride, h, w, suby, + subx, bd); + } else { + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); + + blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0]( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w); + } +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c new file mode 100644 index 000000000..9dabe5b79 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include // SSE4.1 + +#include + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/blend_sse4.h" + +#include "./aom_dsp_rtcd.h" + +////////////////////////////////////////////////////////////////////////////// +// Implementation - No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, + uint32_t src0_stride, + const uint8_t *src1, + uint32_t src1_stride, + const uint8_t *mask, 
int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + for (c = 0; c < w; c += 16) { + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0_w, v_m1_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w); + + // Dimension: width_index + static const blend_fn blend[9] = { + blend_a64_vmask_w16n_sse4_1, // w % 16 == 0 + aom_blend_a64_vmask_c, // w == 1 + aom_blend_a64_vmask_c, // w == 2 + NULL, // INVALID + blend_a64_vmask_w4_sse4_1, // w == 4 + NULL, // INVALID + NULL, // INVALID + NULL, // INVALID + blend_a64_vmask_w8_sse4_1, // w == 8 + }; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, h, + w); +} + +#if CONFIG_HIGHBITDEPTH +////////////////////////////////////////////////////////////////////////////// +// Implementation - No sub-sampling 
+////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_vmask_bn_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + (void)w; + blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, blend_4_b10); +} + +static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + (void)w; + blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, blend_4_b12); +} + +static INLINE void blend_a64_vmask_bn_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + for (c = 0; c < w; c += 8) { + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += 
src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, w, blend_8_b10); +} + +static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, w, blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_highbd_blend_a64_vmask_sse4_1( + uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, + uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int h, int w, int bd) { + typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w); + + // Dimensions are: bd_index X width_index + static const blend_fn blend[2][2] = { + { + // bd == 8 or 10 + blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0 + blend_a64_vmask_b10_w4_sse4_1, // w == 4 + }, + { + // bd == 12 + blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0 + blend_a64_vmask_b12_w4_sse4_1, // w == 4 + } + }; + + assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); + assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + 
aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, h, w, bd); + } else { + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); + + blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, w); + } +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h new file mode 100644 index 000000000..daa2b2b3a --- /dev/null +++ b/third_party/aom/aom_dsp/x86/blend_sse4.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_X86_BLEND_SSE4_H_ +#define AOM_DSP_X86_BLEND_SSE4_H_ + +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/synonyms.h" + +////////////////////////////////////////////////////////////////////////////// +// Common kernels +////////////////////////////////////////////////////////////////////////////// + +static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_b = xx_loadl_32(src0); + const __m128i v_s1_b = xx_loadl_32(src1); + const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); + const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_b = xx_loadl_64(src0); + const __m128i v_s1_b = xx_loadl_64(src1); + const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); + const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +#if CONFIG_HIGHBITDEPTH +typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w); + +static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadl_64(src0); + const __m128i v_s1_w = xx_loadl_64(src1); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); 
+ + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadu_128(src0); + const __m128i v_s1_w = xx_loadu_128(src1); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadl_64(src0); + const __m128i v_s1_w = xx_loadl_64(src1); + + // Interleave + const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); + const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); + + // Multiply-Add + const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w); + + // Scale + const __m128i v_ssum_d = + _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1); + + // Pack + const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d); + + // Round + const __m128i v_res_w = xx_round_epu16(v_pssum_d); + + return v_res_w; +} + +static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadu_128(src0); + const __m128i v_s1_w = xx_loadu_128(src1); + + // Interleave + const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); + const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w); + const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); + const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w); + + // Multiply-Add + const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w); + const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w); + 
+ // Scale + const __m128i v_ssuml_d = + _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1); + const __m128i v_ssumh_d = + _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1); + + // Pack + const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d); + + // Round + const __m128i v_res_w = xx_round_epu16(v_pssum_d); + + return v_res_w; +} +#endif // CONFIG_HIGHBITDEPTH + +#endif // AOM_DSP_X86_BLEND_SSE4_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h new file mode 100644 index 000000000..8641164db --- /dev/null +++ b/third_party/aom/aom_dsp/x86/convolve.h @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_DSP_X86_CONVOLVE_H_ +#define AOM_DSP_X86_CONVOLVE_H_ + +#include + +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_convolve.h" + +typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, const int16_t *filter); + +#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void aom_convolve8_##name##_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h) { \ + (void)filter_x; \ + (void)x_step_q4; \ + (void)filter_y; \ + (void)y_step_q4; \ + assert((-128 <= filter[3]) && (filter[3] <= 127)); \ + assert(step_q4 == 16); \ + if (filter[0] | filter[1] | filter[2]) { \ + while (w >= 16) { \ + aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \ + dst_stride, h, filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \ + dst_stride, h, filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \ + dst_stride, h, filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + if (w) { \ + aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \ + x_step_q4, 
filter_y, y_step_q4, w, h); \ + } \ + } + +#define FUN_CONV_2D(avg, opt) \ + void aom_convolve8_##avg##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h) { \ + assert((-128 <= filter_x[3]) && (filter_x[3] <= 127)); \ + assert((-128 <= filter_y[3]) && (filter_y[3] <= 127)); \ + assert(w <= MAX_SB_SIZE); \ + assert(h <= MAX_SB_SIZE); \ + assert(x_step_q4 == 16); \ + assert(y_step_q4 == 16); \ + if (filter_x[0] || filter_x[1] || filter_x[2] || filter_y[0] || \ + filter_y[1] || filter_y[2]) { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \ + aom_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, \ + MAX_SB_SIZE, filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h + 7); \ + aom_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \ + dst, dst_stride, filter_x, x_step_q4, \ + filter_y, y_step_q4, w, h); \ + } else { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \ + aom_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_SB_SIZE, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, \ + h + 1); \ + aom_convolve8_##avg##vert_##opt(fdata2, MAX_SB_SIZE, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } \ + } + +#if CONFIG_LOOP_RESTORATION +// convolve_add_src is only used by the Wiener filter, which will never +// end up calling the bilinear functions (it uses a symmetric filter, so +// the possible numbers of taps are 1,3,5,7) +#define FUN_CONV_1D_NO_BILINEAR(name, step_q4, filter, dir, src_start, avg, \ + opt) \ + void aom_convolve8_##name##_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h) { \ + (void)filter_x; \ + (void)x_step_q4; \ + (void)filter_y; \ + (void)y_step_q4; \ + assert((-128 <= 
filter[3]) && (filter[3] <= 127)); \ + assert(step_q4 == 16); \ + while (w >= 16) { \ + aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + if (w) { \ + aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \ + x_step_q4, filter_y, y_step_q4, w, h); \ + } \ + } + +#define FUN_CONV_2D_NO_BILINEAR(type, htype, opt) \ + void aom_convolve8_##type##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h) { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \ + assert((-128 <= filter_x[3]) && (filter_x[3] <= 127)); \ + assert((-128 <= filter_y[3]) && (filter_y[3] <= 127)); \ + assert(w <= MAX_SB_SIZE); \ + assert(h <= MAX_SB_SIZE); \ + assert(x_step_q4 == 16); \ + assert(y_step_q4 == 16); \ + aom_convolve8_##htype##horiz_##opt( \ + src - 3 * src_stride, src_stride, fdata2, MAX_SB_SIZE, filter_x, \ + x_step_q4, filter_y, y_step_q4, w, h + 7); \ + aom_convolve8_##type##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \ + dst, dst_stride, filter_x, x_step_q4, \ + filter_y, y_step_q4, w, h); \ + } +#endif + +#if CONFIG_HIGHBITDEPTH +typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, + const ptrdiff_t src_pitch, + uint16_t *output_ptr, + ptrdiff_t out_pitch, + unsigned int output_height, + const int16_t *filter, int bd); + +#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void aom_highbd_convolve8_##name##_##opt( \ + const uint8_t *src8, 
ptrdiff_t src_stride, uint8_t *dst8, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + if (step_q4 == 16 && filter[3] != 128) { \ + if (filter[0] | filter[1] | filter[2]) { \ + while (w >= 16) { \ + aom_highbd_filter_block1d16_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_highbd_filter_block1d8_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_highbd_filter_block1d4_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + aom_highbd_filter_block1d16_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_highbd_filter_block1d8_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_highbd_filter_block1d4_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + aom_highbd_convolve8_##name##_c( \ + CONVERT_TO_BYTEPTR(src), src_stride, CONVERT_TO_BYTEPTR(dst), \ + dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ + } \ + } + +#define HIGH_FUN_CONV_2D(avg, opt) \ + void aom_highbd_convolve8_##avg##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ + assert(w <= MAX_SB_SIZE); \ + assert(h <= MAX_SB_SIZE); \ + if (x_step_q4 == 
16 && y_step_q4 == 16) { \ + if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ + filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ + DECLARE_ALIGNED(16, uint16_t, \ + fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \ + aom_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + CONVERT_TO_BYTEPTR(fdata2), \ + MAX_SB_SIZE, filter_x, x_step_q4, \ + filter_y, y_step_q4, w, h + 7, bd); \ + aom_highbd_convolve8_##avg##vert_##opt( \ + CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_SB_SIZE, MAX_SB_SIZE, dst, \ + dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ + } else { \ + DECLARE_ALIGNED(16, uint16_t, \ + fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \ + aom_highbd_convolve8_horiz_##opt( \ + src, src_stride, CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, h + 1, bd); \ + aom_highbd_convolve8_##avg##vert_##opt( \ + CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ + } \ + } else { \ + aom_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h, bd); \ + } \ + } +#endif // CONFIG_HIGHBITDEPTH + +#endif // AOM_DSP_X86_CONVOLVE_H_ diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c b/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c new file mode 100644 index 000000000..b8ec08de7 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c @@ -0,0 +1,862 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE2 + +#include "aom_dsp/fwd_txfm.h" +#include "aom_dsp/txfm_common.h" +#include "aom_dsp/x86/txfm_common_sse2.h" + +// Apply a 32-element IDCT to 8 columns. This does not do any transposition +// of its output - the caller is expected to do that. +// The input buffers are the top and bottom halves of an 8x32 block. +void fdct32_8col(__m128i *in0, __m128i *in1) { + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); + const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); + const __m128i k__cospi_m26_p06 = 
pair_set_epi16(-cospi_26_64, cospi_6_64); + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); + const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); + const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); + const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); + const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); + const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); + const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); + const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); + const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); + const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); + const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); + const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); + const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); + const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); + const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); + const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i step1[32]; + __m128i step2[32]; + __m128i step3[32]; + __m128i out[32]; + // Stage 1 + { + const __m128i *ina = in0; + const __m128i *inb = in1 + 15; + __m128i *step1a = &step1[0]; + __m128i *step1b = &step1[31]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i 
*)(ina + 2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + } + { + const __m128i *ina = in0 + 4; + const __m128i *inb = in1 + 11; + __m128i *step1a = &step1[4]; + __m128i *step1b = &step1[27]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + } + { + const __m128i *ina = in0 + 8; + const __m128i *inb = in1 + 7; + __m128i *step1a = &step1[8]; + __m128i *step1b = &step1[23]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1)); + const __m128i ina2 = _mm_loadu_si128((const 
__m128i *)(ina + 2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + } + { + const __m128i *ina = in0 + 12; + const __m128i *inb = in1 + 3; + __m128i *step1a = &step1[12]; + __m128i *step1b = &step1[19]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + } + // Stage 2 + { + step2[0] = _mm_add_epi16(step1[0], step1[15]); + step2[1] = _mm_add_epi16(step1[1], step1[14]); + step2[2] = _mm_add_epi16(step1[2], step1[13]); + step2[3] = _mm_add_epi16(step1[3], step1[12]); + step2[4] = _mm_add_epi16(step1[4], step1[11]); + step2[5] = _mm_add_epi16(step1[5], 
step1[10]); + step2[6] = _mm_add_epi16(step1[6], step1[9]); + step2[7] = _mm_add_epi16(step1[7], step1[8]); + step2[8] = _mm_sub_epi16(step1[7], step1[8]); + step2[9] = _mm_sub_epi16(step1[6], step1[9]); + step2[10] = _mm_sub_epi16(step1[5], step1[10]); + step2[11] = _mm_sub_epi16(step1[4], step1[11]); + step2[12] = _mm_sub_epi16(step1[3], step1[12]); + step2[13] = _mm_sub_epi16(step1[2], step1[13]); + step2[14] = _mm_sub_epi16(step1[1], step1[14]); + step2[15] = _mm_sub_epi16(step1[0], step1[15]); + } + { + const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]); + const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]); + const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]); + const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]); + const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]); + const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]); + const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]); + const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]); + const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16); + const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16); + const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16); + const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16); + const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16); + const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16); + const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16); + const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16); + const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16); + const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16); + const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16); + const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16); + const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16); + const __m128i s2_26_3 = 
_mm_madd_epi16(s2_21_1, k__cospi_p16_p16); + const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16); + const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); + const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); + const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); + const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); + const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); + const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); + const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); + const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); + const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); + const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); + const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); + const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); + const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); + const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); + const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); + const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); + const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS); + const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS); + const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS); + const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS); + const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS); + const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS); + const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS); + const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS); + const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS); + const __m128i 
s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS); + const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS); + const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS); + const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS); + const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS); + const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS); + const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS); + // Combine + step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7); + step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7); + step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7); + step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7); + step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7); + step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7); + step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7); + step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7); + } + // Stage 3 + { + step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]); + step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]); + step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]); + step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]); + step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]); + step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]); + step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]); + step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]); + } + { + const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); + const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); + const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); + const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); + const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, 
k__cospi_p16_p16); + const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); + const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); + const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); + const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); + const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); + const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); + const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); + const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); + // Combine + step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); + step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); + step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); + step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); + } + { + step3[16] = _mm_add_epi16(step2[23], step1[16]); + step3[17] = _mm_add_epi16(step2[22], step1[17]); + step3[18] = _mm_add_epi16(step2[21], step1[18]); + step3[19] = _mm_add_epi16(step2[20], step1[19]); + step3[20] = _mm_sub_epi16(step1[19], step2[20]); + step3[21] = _mm_sub_epi16(step1[18], step2[21]); + step3[22] = _mm_sub_epi16(step1[17], step2[22]); + step3[23] = _mm_sub_epi16(step1[16], step2[23]); + step3[24] = _mm_sub_epi16(step1[31], step2[24]); + step3[25] = 
_mm_sub_epi16(step1[30], step2[25]); + step3[26] = _mm_sub_epi16(step1[29], step2[26]); + step3[27] = _mm_sub_epi16(step1[28], step2[27]); + step3[28] = _mm_add_epi16(step2[27], step1[28]); + step3[29] = _mm_add_epi16(step2[26], step1[29]); + step3[30] = _mm_add_epi16(step2[25], step1[30]); + step3[31] = _mm_add_epi16(step2[24], step1[31]); + } + + // Stage 4 + { + step1[0] = _mm_add_epi16(step3[3], step3[0]); + step1[1] = _mm_add_epi16(step3[2], step3[1]); + step1[2] = _mm_sub_epi16(step3[1], step3[2]); + step1[3] = _mm_sub_epi16(step3[0], step3[3]); + step1[8] = _mm_add_epi16(step3[11], step2[8]); + step1[9] = _mm_add_epi16(step3[10], step2[9]); + step1[10] = _mm_sub_epi16(step2[9], step3[10]); + step1[11] = _mm_sub_epi16(step2[8], step3[11]); + step1[12] = _mm_sub_epi16(step2[15], step3[12]); + step1[13] = _mm_sub_epi16(step2[14], step3[13]); + step1[14] = _mm_add_epi16(step3[13], step2[14]); + step1[15] = _mm_add_epi16(step3[12], step2[15]); + } + { + const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); + const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); + const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); + const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); + const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); + const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); + const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); + const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); + const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); + const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); + const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); + const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); + const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); + // Combine + step1[5] = 
_mm_packs_epi32(s1_05_6, s1_05_7); + step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); + } + { + const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); + const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); + const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); + const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); + const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); + const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); + const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); + const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); + const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); + const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); + const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); + const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); + const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); + const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); + const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); + const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); + const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); + const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); + const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); + const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); + const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); + const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); + const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); + const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); + const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); + const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); + const 
__m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); + const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); + const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); + const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); + const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); + const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); + const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); + const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); + const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); + const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); + const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); + const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); + const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); + const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); + const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); + const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); + const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); + const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); + const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); + const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); + const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); + const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); + const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); + const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); + const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); + const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); + const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); + const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); + const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, 
DCT_CONST_BITS); + // Combine + step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); + step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); + step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); + step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); + step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); + step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); + step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); + step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); + } + // Stage 5 + { + step2[4] = _mm_add_epi16(step1[5], step3[4]); + step2[5] = _mm_sub_epi16(step3[4], step1[5]); + step2[6] = _mm_sub_epi16(step3[7], step1[6]); + step2[7] = _mm_add_epi16(step1[6], step3[7]); + } + { + const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); + const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); + const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); + const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); + const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); + const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); + const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); + const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); + const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); + const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); + const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); + const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); + const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); + const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); + const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); + const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); + const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); + const __m128i out_24_4 = 
_mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); + const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); + const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); + const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); + const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); + const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); + const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); + const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); + const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); + const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); + // Combine + out[0] = _mm_packs_epi32(out_00_6, out_00_7); + out[16] = _mm_packs_epi32(out_16_6, out_16_7); + out[8] = _mm_packs_epi32(out_08_6, out_08_7); + out[24] = _mm_packs_epi32(out_24_6, out_24_7); + } + { + const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]); + const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]); + const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); + const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); + const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); + const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); + const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); + const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); + const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); + const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); + const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); + const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); + const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); + const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); + const __m128i s2_10_5 = 
_mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); + const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); + const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); + const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); + const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); + const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); + const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); + const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); + const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); + const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); + const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); + const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); + const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); + // Combine + step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7); + step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); + step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); + step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); + } + { + step2[16] = _mm_add_epi16(step1[19], step3[16]); + step2[17] = _mm_add_epi16(step1[18], step3[17]); + step2[18] = _mm_sub_epi16(step3[17], step1[18]); + step2[19] = _mm_sub_epi16(step3[16], step1[19]); + step2[20] = _mm_sub_epi16(step3[23], step1[20]); + step2[21] = _mm_sub_epi16(step3[22], step1[21]); + step2[22] = _mm_add_epi16(step1[21], step3[22]); + step2[23] = _mm_add_epi16(step1[20], step3[23]); + step2[24] = _mm_add_epi16(step1[27], step3[24]); + step2[25] = _mm_add_epi16(step1[26], step3[25]); + step2[26] = _mm_sub_epi16(step3[25], step1[26]); + step2[27] = _mm_sub_epi16(step3[24], step1[27]); + step2[28] = _mm_sub_epi16(step3[31], step1[28]); + step2[29] = _mm_sub_epi16(step3[30], step1[29]); + step2[30] = _mm_add_epi16(step1[29], step3[30]); + step2[31] = _mm_add_epi16(step1[28], step3[31]); + } + // Stage 6 + { + const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + 
const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); + const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); + const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); + const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); + const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); + const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); + const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); + const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); + // dct_const_round_shift + const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); + const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); + const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); + const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); + const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); + const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); + const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); + const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); + const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); + const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); + const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); + const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS); + const __m128i out_12_6 = _mm_srai_epi32(out_12_4, 
DCT_CONST_BITS); + const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); + const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); + const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); + // Combine + out[4] = _mm_packs_epi32(out_04_6, out_04_7); + out[20] = _mm_packs_epi32(out_20_6, out_20_7); + out[12] = _mm_packs_epi32(out_12_6, out_12_7); + out[28] = _mm_packs_epi32(out_28_6, out_28_7); + } + { + step3[8] = _mm_add_epi16(step2[9], step1[8]); + step3[9] = _mm_sub_epi16(step1[8], step2[9]); + step3[10] = _mm_sub_epi16(step1[11], step2[10]); + step3[11] = _mm_add_epi16(step2[10], step1[11]); + step3[12] = _mm_add_epi16(step2[13], step1[12]); + step3[13] = _mm_sub_epi16(step1[12], step2[13]); + step3[14] = _mm_sub_epi16(step1[15], step2[14]); + step3[15] = _mm_add_epi16(step2[14], step1[15]); + } + { + const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); + const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); + const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); + const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); + const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); + const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); + const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); + const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); + const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); + const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); + const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); + const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); + const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); + const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); + const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); + const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); + const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, 
k__cospi_m20_p12); + const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); + const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); + const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); + const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); + const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); + const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); + const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); + // dct_const_round_shift + const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); + const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); + const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); + const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); + const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); + const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); + const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); + const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); + const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); + const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); + const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); + const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); + const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); + const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); + const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); + const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); + const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); + const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); + const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); + const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); + const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, 
k__DCT_CONST_ROUNDING); + const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); + const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); + const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); + const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); + const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); + const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); + const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); + const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); + const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); + const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); + const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); + // Combine + step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); + step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); + step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); + step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); + // Combine + step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); + step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); + step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); + step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); + } + // Stage 7 + { + const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]); + const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]); + const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]); + const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]); + const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); + const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); + const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); + const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); + const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); + const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); + const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); + const __m128i 
out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); + const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); + const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); + const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); + const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); + const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); + const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); + const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); + const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); + const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); + const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); + const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); + const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); + // dct_const_round_shift + const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); + const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); + const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); + const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); + const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); + const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); + const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); + const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); + const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); + const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); + const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); + const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); + const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); + const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); + const __m128i 
out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); + const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); + const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); + const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); + const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); + const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); + const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); + const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); + const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); + const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); + const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); + const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); + const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); + const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); + const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); + const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); + const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); + const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); + // Combine + out[2] = _mm_packs_epi32(out_02_6, out_02_7); + out[18] = _mm_packs_epi32(out_18_6, out_18_7); + out[10] = _mm_packs_epi32(out_10_6, out_10_7); + out[26] = _mm_packs_epi32(out_26_6, out_26_7); + out[6] = _mm_packs_epi32(out_06_6, out_06_7); + out[22] = _mm_packs_epi32(out_22_6, out_22_7); + out[14] = _mm_packs_epi32(out_14_6, out_14_7); + out[30] = _mm_packs_epi32(out_30_6, out_30_7); + } + { + step1[16] = _mm_add_epi16(step3[17], step2[16]); + step1[17] = _mm_sub_epi16(step2[16], step3[17]); + step1[18] = _mm_sub_epi16(step2[19], step3[18]); + step1[19] = _mm_add_epi16(step3[18], step2[19]); + step1[20] = _mm_add_epi16(step3[21], step2[20]); + step1[21] = _mm_sub_epi16(step2[20], step3[21]); + step1[22] = _mm_sub_epi16(step2[23], 
step3[22]); + step1[23] = _mm_add_epi16(step3[22], step2[23]); + step1[24] = _mm_add_epi16(step3[25], step2[24]); + step1[25] = _mm_sub_epi16(step2[24], step3[25]); + step1[26] = _mm_sub_epi16(step2[27], step3[26]); + step1[27] = _mm_add_epi16(step3[26], step2[27]); + step1[28] = _mm_add_epi16(step3[29], step2[28]); + step1[29] = _mm_sub_epi16(step2[28], step3[29]); + step1[30] = _mm_sub_epi16(step2[31], step3[30]); + step1[31] = _mm_add_epi16(step3[30], step2[31]); + } + // Final stage --- outputs indices are bit-reversed. + { + const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); + const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); + const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); + const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); + const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); + const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); + const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); + const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); + const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); + const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); + const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); + const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); + const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); + const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); + const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); + const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); + const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); + const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); + const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); + const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); + const __m128i out_15_2 = _mm_madd_epi16(out_17_0, 
k__cospi_m17_p15); + const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); + const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); + const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); + // dct_const_round_shift + const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); + const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); + const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); + const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); + const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); + const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); + const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); + const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); + const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); + const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); + const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); + const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); + const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); + const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); + const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); + const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); + const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); + const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); + const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); + const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); + const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); + const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); + const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); + const __m128i out_25_7 = _mm_srai_epi32(out_25_5, 
DCT_CONST_BITS); + const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); + const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); + const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); + const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); + const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); + const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); + const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); + const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); + // Combine + out[1] = _mm_packs_epi32(out_01_6, out_01_7); + out[17] = _mm_packs_epi32(out_17_6, out_17_7); + out[9] = _mm_packs_epi32(out_09_6, out_09_7); + out[25] = _mm_packs_epi32(out_25_6, out_25_7); + out[7] = _mm_packs_epi32(out_07_6, out_07_7); + out[23] = _mm_packs_epi32(out_23_6, out_23_7); + out[15] = _mm_packs_epi32(out_15_6, out_15_7); + out[31] = _mm_packs_epi32(out_31_6, out_31_7); + } + { + const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); + const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); + const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); + const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); + const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); + const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); + const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); + const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); + const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); + const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); + const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); + const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); + const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); + const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); + const __m128i out_29_2 = 
_mm_madd_epi16(out_29_0, k__cospi_p03_p29); + const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); + const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); + const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); + const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); + const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); + const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); + const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); + const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); + const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); + // dct_const_round_shift + const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); + const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); + const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); + const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); + const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); + const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); + const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); + const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); + const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); + const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); + const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); + const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); + const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); + const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); + const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); + const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); + const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); + const __m128i 
out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); + const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); + const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); + const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); + const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); + const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); + const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); + const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); + const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); + const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); + const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); + const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); + const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); + const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); + const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); + // Combine + out[5] = _mm_packs_epi32(out_05_6, out_05_7); + out[21] = _mm_packs_epi32(out_21_6, out_21_7); + out[13] = _mm_packs_epi32(out_13_6, out_13_7); + out[29] = _mm_packs_epi32(out_29_6, out_29_7); + out[3] = _mm_packs_epi32(out_03_6, out_03_7); + out[19] = _mm_packs_epi32(out_19_6, out_19_7); + out[11] = _mm_packs_epi32(out_11_6, out_11_7); + out[27] = _mm_packs_epi32(out_27_6, out_27_7); + } + + // Output results + { + int j; + for (j = 0; j < 16; ++j) { + _mm_storeu_si128((__m128i *)(in0 + j), out[j]); + _mm_storeu_si128((__m128i *)(in1 + j), out[j + 16]); + } + } +} // NOLINT diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h new file mode 100644 index 000000000..216739581 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h @@ -0,0 +1,3022 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // AVX2 + +#include "aom_dsp/txfm_common.h" +#include "aom_dsp/x86/txfm_common_intrin.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +#if FDCT32x32_HIGH_PRECISION +static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) { + __m256i buf0, buf1; + buf0 = _mm256_mul_epu32(a, b); + a = _mm256_srli_epi64(a, 32); + b = _mm256_srli_epi64(b, 32); + buf1 = _mm256_mul_epu32(a, b); + return _mm256_add_epi64(buf0, buf1); +} + +static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) { + __m256i buf0 = _mm256_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); + __m256i buf1 = _mm256_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); + return _mm256_unpacklo_epi64(buf0, buf1); +} +#endif + +#ifndef STORE_COEFF_FUNC +#define STORE_COEFF_FUNC +static void store_coeff(const __m256i *coeff, tran_low_t *curr, + tran_low_t *next) { + __m128i u = _mm256_castsi256_si128(*coeff); + storeu_output(&u, curr); + u = _mm256_extractf128_si256(*coeff, 1); + storeu_output(&u, next); +} +#endif + +void FDCT32x32_2D_AVX2(const int16_t *input, tran_low_t *output_org, + int stride) { + // Calculate pre-multiplied strides + const int str1 = stride; + const int str2 = 2 * stride; + const int str3 = 2 * stride + str1; + // We need an intermediate buffer between passes. + DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]); + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. 
This is done + // by constructing the 32 bit constant corresponding to that pair. + const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); + const __m256i k__cospi_p16_m16 = + pair256_set_epi16(+cospi_16_64, -cospi_16_64); + const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); + const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); + const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64); + const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64); + const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64); + const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64); + const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64); + const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); + const __m256i k__cospi_m12_m20 = + pair256_set_epi16(-cospi_12_64, -cospi_20_64); + const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64); + const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64); + const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64); + const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64); + const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64); + const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64); + const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64); + const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64); + const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64); + const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64); + const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64); + const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64); + const __m256i k__cospi_m25_p07 = 
pair256_set_epi16(-cospi_25_64, cospi_7_64); + const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64); + const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64); + const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64); + const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64); + const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64); + const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64); + const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64); + const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64); + const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64); + const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64); + const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64); + const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); + const __m256i kZero = _mm256_set1_epi16(0); + const __m256i kOne = _mm256_set1_epi16(1); + // Do the two transform/transpose passes + int pass; + for (pass = 0; pass < 2; ++pass) { + // We process sixteen columns (transposed rows in second pass) at a time. + int column_start; + for (column_start = 0; column_start < 32; column_start += 16) { + __m256i step1[32]; + __m256i step2[32]; + __m256i step3[32]; + __m256i out[32]; + // Stage 1 + // Note: even though all the loads below are aligned, using the aligned + // intrinsic make the code slightly slower. + if (0 == pass) { + const int16_t *in = &input[column_start]; + // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; + // Note: the next four blocks could be in a loop. That would help the + // instruction cache but is actually slower. 
+ { + const int16_t *ina = in + 0 * str1; + const int16_t *inb = in + 31 * str1; + __m256i *step1a = &step1[0]; + __m256i *step1b = &step1[31]; + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); + step1b[-3] = _mm256_sub_epi16(ina3, inb3); + step1b[-2] = _mm256_sub_epi16(ina2, inb2); + step1b[-1] = _mm256_sub_epi16(ina1, inb1); + step1b[-0] = _mm256_sub_epi16(ina0, inb0); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); + step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 4 * str1; + const int16_t *inb = in + 27 * str1; + __m256i *step1a = &step1[4]; + __m256i *step1b = &step1[27]; + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + 
const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); + step1b[-3] = _mm256_sub_epi16(ina3, inb3); + step1b[-2] = _mm256_sub_epi16(ina2, inb2); + step1b[-1] = _mm256_sub_epi16(ina1, inb1); + step1b[-0] = _mm256_sub_epi16(ina0, inb0); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); + step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 8 * str1; + const int16_t *inb = in + 23 * str1; + __m256i *step1a = &step1[8]; + __m256i *step1b = &step1[23]; + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); + step1b[-3] = _mm256_sub_epi16(ina3, inb3); + step1b[-2] = _mm256_sub_epi16(ina2, inb2); + step1b[-1] = _mm256_sub_epi16(ina1, 
inb1); + step1b[-0] = _mm256_sub_epi16(ina0, inb0); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); + step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 12 * str1; + const int16_t *inb = in + 19 * str1; + __m256i *step1a = &step1[12]; + __m256i *step1b = &step1[19]; + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); + step1b[-3] = _mm256_sub_epi16(ina3, inb3); + step1b[-2] = _mm256_sub_epi16(ina2, inb2); + step1b[-1] = _mm256_sub_epi16(ina1, inb1); + step1b[-0] = _mm256_sub_epi16(ina0, inb0); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); + step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); + } + } else { + int16_t *in = &intermediate[column_start]; + // step1[i] = in[ 0 * 
32] + in[(32 - 1) * 32]; + // Note: using the same approach as above to have common offset is + // counter-productive as all offsets can be calculated at compile + // time. + // Note: the next four blocks could be in a loop. That would help the + // instruction cache but is actually slower. + { + __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32)); + __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32)); + __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32)); + __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32)); + __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32)); + __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32)); + __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32)); + __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32)); + step1[0] = _mm256_add_epi16(in00, in31); + step1[1] = _mm256_add_epi16(in01, in30); + step1[2] = _mm256_add_epi16(in02, in29); + step1[3] = _mm256_add_epi16(in03, in28); + step1[28] = _mm256_sub_epi16(in03, in28); + step1[29] = _mm256_sub_epi16(in02, in29); + step1[30] = _mm256_sub_epi16(in01, in30); + step1[31] = _mm256_sub_epi16(in00, in31); + } + { + __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32)); + __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32)); + __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32)); + __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32)); + __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32)); + __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32)); + __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32)); + __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32)); + step1[4] = _mm256_add_epi16(in04, in27); + step1[5] = _mm256_add_epi16(in05, in26); + step1[6] = _mm256_add_epi16(in06, in25); + step1[7] = _mm256_add_epi16(in07, in24); + step1[24] = _mm256_sub_epi16(in07, in24); + step1[25] = 
_mm256_sub_epi16(in06, in25); + step1[26] = _mm256_sub_epi16(in05, in26); + step1[27] = _mm256_sub_epi16(in04, in27); + } + { + __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32)); + __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32)); + __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32)); + __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32)); + __m256i in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32)); + __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32)); + __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32)); + __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32)); + step1[8] = _mm256_add_epi16(in08, in23); + step1[9] = _mm256_add_epi16(in09, in22); + step1[10] = _mm256_add_epi16(in10, in21); + step1[11] = _mm256_add_epi16(in11, in20); + step1[20] = _mm256_sub_epi16(in11, in20); + step1[21] = _mm256_sub_epi16(in10, in21); + step1[22] = _mm256_sub_epi16(in09, in22); + step1[23] = _mm256_sub_epi16(in08, in23); + } + { + __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32)); + __m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32)); + __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32)); + __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32)); + __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32)); + __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32)); + __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32)); + __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32)); + step1[12] = _mm256_add_epi16(in12, in19); + step1[13] = _mm256_add_epi16(in13, in18); + step1[14] = _mm256_add_epi16(in14, in17); + step1[15] = _mm256_add_epi16(in15, in16); + step1[16] = _mm256_sub_epi16(in15, in16); + step1[17] = _mm256_sub_epi16(in14, in17); + step1[18] = _mm256_sub_epi16(in13, in18); + step1[19] = _mm256_sub_epi16(in12, in19); + } + } + // Stage 2 + { + step2[0] = 
_mm256_add_epi16(step1[0], step1[15]); + step2[1] = _mm256_add_epi16(step1[1], step1[14]); + step2[2] = _mm256_add_epi16(step1[2], step1[13]); + step2[3] = _mm256_add_epi16(step1[3], step1[12]); + step2[4] = _mm256_add_epi16(step1[4], step1[11]); + step2[5] = _mm256_add_epi16(step1[5], step1[10]); + step2[6] = _mm256_add_epi16(step1[6], step1[9]); + step2[7] = _mm256_add_epi16(step1[7], step1[8]); + step2[8] = _mm256_sub_epi16(step1[7], step1[8]); + step2[9] = _mm256_sub_epi16(step1[6], step1[9]); + step2[10] = _mm256_sub_epi16(step1[5], step1[10]); + step2[11] = _mm256_sub_epi16(step1[4], step1[11]); + step2[12] = _mm256_sub_epi16(step1[3], step1[12]); + step2[13] = _mm256_sub_epi16(step1[2], step1[13]); + step2[14] = _mm256_sub_epi16(step1[1], step1[14]); + step2[15] = _mm256_sub_epi16(step1[0], step1[15]); + } + { + const __m256i s2_20_0 = _mm256_unpacklo_epi16(step1[27], step1[20]); + const __m256i s2_20_1 = _mm256_unpackhi_epi16(step1[27], step1[20]); + const __m256i s2_21_0 = _mm256_unpacklo_epi16(step1[26], step1[21]); + const __m256i s2_21_1 = _mm256_unpackhi_epi16(step1[26], step1[21]); + const __m256i s2_22_0 = _mm256_unpacklo_epi16(step1[25], step1[22]); + const __m256i s2_22_1 = _mm256_unpackhi_epi16(step1[25], step1[22]); + const __m256i s2_23_0 = _mm256_unpacklo_epi16(step1[24], step1[23]); + const __m256i s2_23_1 = _mm256_unpackhi_epi16(step1[24], step1[23]); + const __m256i s2_20_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_m16); + const __m256i s2_20_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_m16); + const __m256i s2_21_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_m16); + const __m256i s2_21_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_m16); + const __m256i s2_22_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_m16); + const __m256i s2_22_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_m16); + const __m256i s2_23_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_m16); + const __m256i s2_23_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_m16); + const __m256i s2_24_2 = 
_mm256_madd_epi16(s2_23_0, k__cospi_p16_p16); + const __m256i s2_24_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_p16); + const __m256i s2_25_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_p16); + const __m256i s2_25_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_p16); + const __m256i s2_26_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_p16); + const __m256i s2_26_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_p16); + const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16); + const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s2_20_4 = + _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); + const __m256i s2_20_5 = + _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); + const __m256i s2_21_4 = + _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); + const __m256i s2_21_5 = + _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); + const __m256i s2_22_4 = + _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); + const __m256i s2_22_5 = + _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); + const __m256i s2_23_4 = + _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); + const __m256i s2_23_5 = + _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); + const __m256i s2_24_4 = + _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); + const __m256i s2_24_5 = + _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); + const __m256i s2_25_4 = + _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); + const __m256i s2_25_5 = + _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); + const __m256i s2_26_4 = + _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); + const __m256i s2_26_5 = + _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); + const __m256i s2_27_4 = + _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); + const __m256i s2_27_5 = + _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); + const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS); + const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS); + const __m256i s2_21_6 = 
_mm256_srai_epi32(s2_21_4, DCT_CONST_BITS); + const __m256i s2_21_7 = _mm256_srai_epi32(s2_21_5, DCT_CONST_BITS); + const __m256i s2_22_6 = _mm256_srai_epi32(s2_22_4, DCT_CONST_BITS); + const __m256i s2_22_7 = _mm256_srai_epi32(s2_22_5, DCT_CONST_BITS); + const __m256i s2_23_6 = _mm256_srai_epi32(s2_23_4, DCT_CONST_BITS); + const __m256i s2_23_7 = _mm256_srai_epi32(s2_23_5, DCT_CONST_BITS); + const __m256i s2_24_6 = _mm256_srai_epi32(s2_24_4, DCT_CONST_BITS); + const __m256i s2_24_7 = _mm256_srai_epi32(s2_24_5, DCT_CONST_BITS); + const __m256i s2_25_6 = _mm256_srai_epi32(s2_25_4, DCT_CONST_BITS); + const __m256i s2_25_7 = _mm256_srai_epi32(s2_25_5, DCT_CONST_BITS); + const __m256i s2_26_6 = _mm256_srai_epi32(s2_26_4, DCT_CONST_BITS); + const __m256i s2_26_7 = _mm256_srai_epi32(s2_26_5, DCT_CONST_BITS); + const __m256i s2_27_6 = _mm256_srai_epi32(s2_27_4, DCT_CONST_BITS); + const __m256i s2_27_7 = _mm256_srai_epi32(s2_27_5, DCT_CONST_BITS); + // Combine + step2[20] = _mm256_packs_epi32(s2_20_6, s2_20_7); + step2[21] = _mm256_packs_epi32(s2_21_6, s2_21_7); + step2[22] = _mm256_packs_epi32(s2_22_6, s2_22_7); + step2[23] = _mm256_packs_epi32(s2_23_6, s2_23_7); + step2[24] = _mm256_packs_epi32(s2_24_6, s2_24_7); + step2[25] = _mm256_packs_epi32(s2_25_6, s2_25_7); + step2[26] = _mm256_packs_epi32(s2_26_6, s2_26_7); + step2[27] = _mm256_packs_epi32(s2_27_6, s2_27_7); + } + +#if !FDCT32x32_HIGH_PRECISION + // dump the magnitude by half, hence the intermediate values are within + // the range of 16 bits. 
+ if (1 == pass) { + __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero, step2[0]); + __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero, step2[1]); + __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero, step2[2]); + __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero, step2[3]); + __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero, step2[4]); + __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero, step2[5]); + __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero, step2[6]); + __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero, step2[7]); + __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero, step2[8]); + __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero, step2[9]); + __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero, step2[10]); + __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero, step2[11]); + __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero, step2[12]); + __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero, step2[13]); + __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero, step2[14]); + __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero, step2[15]); + __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero, step1[16]); + __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero, step1[17]); + __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero, step1[18]); + __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero, step1[19]); + __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero, step2[20]); + __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero, step2[21]); + __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero, step2[22]); + __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero, step2[23]); + __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero, step2[24]); + __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero, step2[25]); + __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero, step2[26]); + __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero, step2[27]); + __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero, step1[28]); + __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero, step1[29]); + __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero, step1[30]); + __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero, step1[31]); + + step2[0] = _mm256_sub_epi16(step2[0], s3_00_0); + step2[1] = _mm256_sub_epi16(step2[1], s3_01_0); + step2[2] = 
_mm256_sub_epi16(step2[2], s3_02_0); + step2[3] = _mm256_sub_epi16(step2[3], s3_03_0); + step2[4] = _mm256_sub_epi16(step2[4], s3_04_0); + step2[5] = _mm256_sub_epi16(step2[5], s3_05_0); + step2[6] = _mm256_sub_epi16(step2[6], s3_06_0); + step2[7] = _mm256_sub_epi16(step2[7], s3_07_0); + step2[8] = _mm256_sub_epi16(step2[8], s2_08_0); + step2[9] = _mm256_sub_epi16(step2[9], s2_09_0); + step2[10] = _mm256_sub_epi16(step2[10], s3_10_0); + step2[11] = _mm256_sub_epi16(step2[11], s3_11_0); + step2[12] = _mm256_sub_epi16(step2[12], s3_12_0); + step2[13] = _mm256_sub_epi16(step2[13], s3_13_0); + step2[14] = _mm256_sub_epi16(step2[14], s2_14_0); + step2[15] = _mm256_sub_epi16(step2[15], s2_15_0); + step1[16] = _mm256_sub_epi16(step1[16], s3_16_0); + step1[17] = _mm256_sub_epi16(step1[17], s3_17_0); + step1[18] = _mm256_sub_epi16(step1[18], s3_18_0); + step1[19] = _mm256_sub_epi16(step1[19], s3_19_0); + step2[20] = _mm256_sub_epi16(step2[20], s3_20_0); + step2[21] = _mm256_sub_epi16(step2[21], s3_21_0); + step2[22] = _mm256_sub_epi16(step2[22], s3_22_0); + step2[23] = _mm256_sub_epi16(step2[23], s3_23_0); + step2[24] = _mm256_sub_epi16(step2[24], s3_24_0); + step2[25] = _mm256_sub_epi16(step2[25], s3_25_0); + step2[26] = _mm256_sub_epi16(step2[26], s3_26_0); + step2[27] = _mm256_sub_epi16(step2[27], s3_27_0); + step1[28] = _mm256_sub_epi16(step1[28], s3_28_0); + step1[29] = _mm256_sub_epi16(step1[29], s3_29_0); + step1[30] = _mm256_sub_epi16(step1[30], s3_30_0); + step1[31] = _mm256_sub_epi16(step1[31], s3_31_0); + + step2[0] = _mm256_add_epi16(step2[0], kOne); + step2[1] = _mm256_add_epi16(step2[1], kOne); + step2[2] = _mm256_add_epi16(step2[2], kOne); + step2[3] = _mm256_add_epi16(step2[3], kOne); + step2[4] = _mm256_add_epi16(step2[4], kOne); + step2[5] = _mm256_add_epi16(step2[5], kOne); + step2[6] = _mm256_add_epi16(step2[6], kOne); + step2[7] = _mm256_add_epi16(step2[7], kOne); + step2[8] = _mm256_add_epi16(step2[8], kOne); + step2[9] = _mm256_add_epi16(step2[9], 
kOne); + step2[10] = _mm256_add_epi16(step2[10], kOne); + step2[11] = _mm256_add_epi16(step2[11], kOne); + step2[12] = _mm256_add_epi16(step2[12], kOne); + step2[13] = _mm256_add_epi16(step2[13], kOne); + step2[14] = _mm256_add_epi16(step2[14], kOne); + step2[15] = _mm256_add_epi16(step2[15], kOne); + step1[16] = _mm256_add_epi16(step1[16], kOne); + step1[17] = _mm256_add_epi16(step1[17], kOne); + step1[18] = _mm256_add_epi16(step1[18], kOne); + step1[19] = _mm256_add_epi16(step1[19], kOne); + step2[20] = _mm256_add_epi16(step2[20], kOne); + step2[21] = _mm256_add_epi16(step2[21], kOne); + step2[22] = _mm256_add_epi16(step2[22], kOne); + step2[23] = _mm256_add_epi16(step2[23], kOne); + step2[24] = _mm256_add_epi16(step2[24], kOne); + step2[25] = _mm256_add_epi16(step2[25], kOne); + step2[26] = _mm256_add_epi16(step2[26], kOne); + step2[27] = _mm256_add_epi16(step2[27], kOne); + step1[28] = _mm256_add_epi16(step1[28], kOne); + step1[29] = _mm256_add_epi16(step1[29], kOne); + step1[30] = _mm256_add_epi16(step1[30], kOne); + step1[31] = _mm256_add_epi16(step1[31], kOne); + + step2[0] = _mm256_srai_epi16(step2[0], 2); + step2[1] = _mm256_srai_epi16(step2[1], 2); + step2[2] = _mm256_srai_epi16(step2[2], 2); + step2[3] = _mm256_srai_epi16(step2[3], 2); + step2[4] = _mm256_srai_epi16(step2[4], 2); + step2[5] = _mm256_srai_epi16(step2[5], 2); + step2[6] = _mm256_srai_epi16(step2[6], 2); + step2[7] = _mm256_srai_epi16(step2[7], 2); + step2[8] = _mm256_srai_epi16(step2[8], 2); + step2[9] = _mm256_srai_epi16(step2[9], 2); + step2[10] = _mm256_srai_epi16(step2[10], 2); + step2[11] = _mm256_srai_epi16(step2[11], 2); + step2[12] = _mm256_srai_epi16(step2[12], 2); + step2[13] = _mm256_srai_epi16(step2[13], 2); + step2[14] = _mm256_srai_epi16(step2[14], 2); + step2[15] = _mm256_srai_epi16(step2[15], 2); + step1[16] = _mm256_srai_epi16(step1[16], 2); + step1[17] = _mm256_srai_epi16(step1[17], 2); + step1[18] = _mm256_srai_epi16(step1[18], 2); + step1[19] = 
_mm256_srai_epi16(step1[19], 2); + step2[20] = _mm256_srai_epi16(step2[20], 2); + step2[21] = _mm256_srai_epi16(step2[21], 2); + step2[22] = _mm256_srai_epi16(step2[22], 2); + step2[23] = _mm256_srai_epi16(step2[23], 2); + step2[24] = _mm256_srai_epi16(step2[24], 2); + step2[25] = _mm256_srai_epi16(step2[25], 2); + step2[26] = _mm256_srai_epi16(step2[26], 2); + step2[27] = _mm256_srai_epi16(step2[27], 2); + step1[28] = _mm256_srai_epi16(step1[28], 2); + step1[29] = _mm256_srai_epi16(step1[29], 2); + step1[30] = _mm256_srai_epi16(step1[30], 2); + step1[31] = _mm256_srai_epi16(step1[31], 2); + } +#endif + +#if FDCT32x32_HIGH_PRECISION + if (pass == 0) { +#endif + // Stage 3 + { + step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]); + step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]); + step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]); + step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]); + step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]); + step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]); + step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]); + step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]); + } + { + const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]); + const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]); + const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]); + const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]); + const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, 
k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s3_10_4 = + _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m256i s3_10_5 = + _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m256i s3_11_4 = + _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m256i s3_11_5 = + _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m256i s3_12_4 = + _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m256i s3_12_5 = + _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m256i s3_13_4 = + _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m256i s3_13_5 = + _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS); + const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS); + const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS); + const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS); + const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS); + const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS); + const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS); + const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS); + // Combine + step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7); + step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7); + step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7); + step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7); + } + { + step3[16] = _mm256_add_epi16(step2[23], step1[16]); + step3[17] = _mm256_add_epi16(step2[22], step1[17]); + step3[18] = _mm256_add_epi16(step2[21], step1[18]); + step3[19] = _mm256_add_epi16(step2[20], step1[19]); + step3[20] = _mm256_sub_epi16(step1[19], step2[20]); + step3[21] = _mm256_sub_epi16(step1[18], step2[21]); + step3[22] = _mm256_sub_epi16(step1[17], step2[22]); + step3[23] = _mm256_sub_epi16(step1[16], step2[23]); + step3[24] = _mm256_sub_epi16(step1[31], step2[24]); + step3[25] = _mm256_sub_epi16(step1[30], step2[25]); 
+ step3[26] = _mm256_sub_epi16(step1[29], step2[26]); + step3[27] = _mm256_sub_epi16(step1[28], step2[27]); + step3[28] = _mm256_add_epi16(step2[27], step1[28]); + step3[29] = _mm256_add_epi16(step2[26], step1[29]); + step3[30] = _mm256_add_epi16(step2[25], step1[30]); + step3[31] = _mm256_add_epi16(step2[24], step1[31]); + } + + // Stage 4 + { + step1[0] = _mm256_add_epi16(step3[3], step3[0]); + step1[1] = _mm256_add_epi16(step3[2], step3[1]); + step1[2] = _mm256_sub_epi16(step3[1], step3[2]); + step1[3] = _mm256_sub_epi16(step3[0], step3[3]); + step1[8] = _mm256_add_epi16(step3[11], step2[8]); + step1[9] = _mm256_add_epi16(step3[10], step2[9]); + step1[10] = _mm256_sub_epi16(step2[9], step3[10]); + step1[11] = _mm256_sub_epi16(step2[8], step3[11]); + step1[12] = _mm256_sub_epi16(step2[15], step3[12]); + step1[13] = _mm256_sub_epi16(step2[14], step3[13]); + step1[14] = _mm256_add_epi16(step3[13], step2[14]); + step1[15] = _mm256_add_epi16(step3[12], step2[15]); + } + { + const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]); + const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]); + const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16); + const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16); + const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16); + const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s1_05_4 = + _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); + const __m256i s1_05_5 = + _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); + const __m256i s1_06_4 = + _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); + const __m256i s1_06_5 = + _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); + const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS); + const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS); + const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS); + const __m256i s1_06_7 = 
_mm256_srai_epi32(s1_06_5, DCT_CONST_BITS); + // Combine + step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7); + step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7); + } + { + const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]); + const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]); + const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]); + const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]); + const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]); + const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]); + const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]); + const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]); + const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24); + const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24); + const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24); + const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24); + const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08); + const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08); + const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08); + const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08); + const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24); + const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24); + const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24); + const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24); + const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08); + const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08); + const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08); + const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m256i s1_18_4 = + _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); + const 
__m256i s1_18_5 = + _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); + const __m256i s1_19_4 = + _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); + const __m256i s1_19_5 = + _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); + const __m256i s1_20_4 = + _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); + const __m256i s1_20_5 = + _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); + const __m256i s1_21_4 = + _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); + const __m256i s1_21_5 = + _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); + const __m256i s1_26_4 = + _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); + const __m256i s1_26_5 = + _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); + const __m256i s1_27_4 = + _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); + const __m256i s1_27_5 = + _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); + const __m256i s1_28_4 = + _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); + const __m256i s1_28_5 = + _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); + const __m256i s1_29_4 = + _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); + const __m256i s1_29_5 = + _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); + const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS); + const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS); + const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS); + const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS); + const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS); + const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS); + const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS); + const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS); + const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS); + const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS); + const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS); + const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, 
DCT_CONST_BITS); + const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS); + const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS); + const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS); + const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS); + // Combine + step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7); + step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7); + step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7); + step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7); + step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7); + step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7); + step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7); + step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7); + } + // Stage 5 + { + step2[4] = _mm256_add_epi16(step1[5], step3[4]); + step2[5] = _mm256_sub_epi16(step3[4], step1[5]); + step2[6] = _mm256_sub_epi16(step3[7], step1[6]); + step2[7] = _mm256_add_epi16(step1[6], step3[7]); + } + { + const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]); + const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]); + const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]); + const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]); + const __m256i out_00_2 = + _mm256_madd_epi16(out_00_0, k__cospi_p16_p16); + const __m256i out_00_3 = + _mm256_madd_epi16(out_00_1, k__cospi_p16_p16); + const __m256i out_16_2 = + _mm256_madd_epi16(out_00_0, k__cospi_p16_m16); + const __m256i out_16_3 = + _mm256_madd_epi16(out_00_1, k__cospi_p16_m16); + const __m256i out_08_2 = + _mm256_madd_epi16(out_08_0, k__cospi_p24_p08); + const __m256i out_08_3 = + _mm256_madd_epi16(out_08_1, k__cospi_p24_p08); + const __m256i out_24_2 = + _mm256_madd_epi16(out_08_0, k__cospi_m08_p24); + const __m256i out_24_3 = + _mm256_madd_epi16(out_08_1, k__cospi_m08_p24); + // dct_const_round_shift + const __m256i out_00_4 = + _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); + const __m256i out_00_5 = + 
_mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); + const __m256i out_16_4 = + _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); + const __m256i out_16_5 = + _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); + const __m256i out_08_4 = + _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); + const __m256i out_08_5 = + _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); + const __m256i out_24_4 = + _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); + const __m256i out_24_5 = + _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); + const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS); + const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS); + const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS); + const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS); + const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS); + const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS); + const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS); + const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS); + // Combine + out[0] = _mm256_packs_epi32(out_00_6, out_00_7); + out[16] = _mm256_packs_epi32(out_16_6, out_16_7); + out[8] = _mm256_packs_epi32(out_08_6, out_08_7); + out[24] = _mm256_packs_epi32(out_24_6, out_24_7); + } + { + const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[9], step1[14]); + const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[9], step1[14]); + const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]); + const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]); + const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24); + const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24); + const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08); + const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08); + const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24); + const __m256i s2_13_3 
= _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24); + const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08); + const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m256i s2_09_4 = + _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); + const __m256i s2_09_5 = + _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); + const __m256i s2_10_4 = + _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); + const __m256i s2_10_5 = + _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); + const __m256i s2_13_4 = + _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); + const __m256i s2_13_5 = + _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); + const __m256i s2_14_4 = + _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); + const __m256i s2_14_5 = + _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); + const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS); + const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS); + const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS); + const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS); + const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS); + const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS); + const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS); + const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS); + // Combine + step2[9] = _mm256_packs_epi32(s2_09_6, s2_09_7); + step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7); + step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7); + step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7); + } + { + step2[16] = _mm256_add_epi16(step1[19], step3[16]); + step2[17] = _mm256_add_epi16(step1[18], step3[17]); + step2[18] = _mm256_sub_epi16(step3[17], step1[18]); + step2[19] = _mm256_sub_epi16(step3[16], step1[19]); + step2[20] = _mm256_sub_epi16(step3[23], step1[20]); + step2[21] = _mm256_sub_epi16(step3[22], step1[21]); + step2[22] = _mm256_add_epi16(step1[21], 
step3[22]); + step2[23] = _mm256_add_epi16(step1[20], step3[23]); + step2[24] = _mm256_add_epi16(step1[27], step3[24]); + step2[25] = _mm256_add_epi16(step1[26], step3[25]); + step2[26] = _mm256_sub_epi16(step3[25], step1[26]); + step2[27] = _mm256_sub_epi16(step3[24], step1[27]); + step2[28] = _mm256_sub_epi16(step3[31], step1[28]); + step2[29] = _mm256_sub_epi16(step3[30], step1[29]); + step2[30] = _mm256_add_epi16(step1[29], step3[30]); + step2[31] = _mm256_add_epi16(step1[28], step3[31]); + } + // Stage 6 + { + const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); + const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); + const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); + const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); + const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); + const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); + const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); + const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); + const __m256i out_04_2 = + _mm256_madd_epi16(out_04_0, k__cospi_p28_p04); + const __m256i out_04_3 = + _mm256_madd_epi16(out_04_1, k__cospi_p28_p04); + const __m256i out_20_2 = + _mm256_madd_epi16(out_20_0, k__cospi_p12_p20); + const __m256i out_20_3 = + _mm256_madd_epi16(out_20_1, k__cospi_p12_p20); + const __m256i out_12_2 = + _mm256_madd_epi16(out_12_0, k__cospi_m20_p12); + const __m256i out_12_3 = + _mm256_madd_epi16(out_12_1, k__cospi_m20_p12); + const __m256i out_28_2 = + _mm256_madd_epi16(out_28_0, k__cospi_m04_p28); + const __m256i out_28_3 = + _mm256_madd_epi16(out_28_1, k__cospi_m04_p28); + // dct_const_round_shift + const __m256i out_04_4 = + _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); + const __m256i out_04_5 = + _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); + const __m256i out_20_4 = + _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); + const __m256i out_20_5 = + 
_mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); + const __m256i out_12_4 = + _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); + const __m256i out_12_5 = + _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); + const __m256i out_28_4 = + _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); + const __m256i out_28_5 = + _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); + const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS); + const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS); + const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS); + const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS); + const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS); + const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS); + const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS); + const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS); + // Combine + out[4] = _mm256_packs_epi32(out_04_6, out_04_7); + out[20] = _mm256_packs_epi32(out_20_6, out_20_7); + out[12] = _mm256_packs_epi32(out_12_6, out_12_7); + out[28] = _mm256_packs_epi32(out_28_6, out_28_7); + } + { + step3[8] = _mm256_add_epi16(step2[9], step1[8]); + step3[9] = _mm256_sub_epi16(step1[8], step2[9]); + step3[10] = _mm256_sub_epi16(step1[11], step2[10]); + step3[11] = _mm256_add_epi16(step2[10], step1[11]); + step3[12] = _mm256_add_epi16(step2[13], step1[12]); + step3[13] = _mm256_sub_epi16(step1[12], step2[13]); + step3[14] = _mm256_sub_epi16(step1[15], step2[14]); + step3[15] = _mm256_add_epi16(step2[14], step1[15]); + } + { + const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]); + const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]); + const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]); + const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]); + const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]); + const __m256i s3_21_1 = 
_mm256_unpackhi_epi16(step2[21], step2[26]); + const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]); + const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]); + const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28); + const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28); + const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04); + const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04); + const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12); + const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12); + const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20); + const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20); + const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12); + const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12); + const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20); + const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20); + const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28); + const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28); + const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04); + const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04); + // dct_const_round_shift + const __m256i s3_17_4 = + _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); + const __m256i s3_17_5 = + _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); + const __m256i s3_18_4 = + _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); + const __m256i s3_18_5 = + _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); + const __m256i s3_21_4 = + _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); + const __m256i s3_21_5 = + _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); + const __m256i s3_22_4 = + _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); + const __m256i s3_22_5 = + _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); + const 
__m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS); + const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS); + const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS); + const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS); + const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS); + const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS); + const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS); + const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS); + const __m256i s3_25_4 = + _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); + const __m256i s3_25_5 = + _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); + const __m256i s3_26_4 = + _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); + const __m256i s3_26_5 = + _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); + const __m256i s3_29_4 = + _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); + const __m256i s3_29_5 = + _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); + const __m256i s3_30_4 = + _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); + const __m256i s3_30_5 = + _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); + const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS); + const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS); + const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS); + const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS); + const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS); + const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS); + const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS); + const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS); + // Combine + step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7); + step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7); + step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7); + step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7); + // Combine + step3[25] = 
_mm256_packs_epi32(s3_25_6, s3_25_7); + step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7); + step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7); + step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7); + } + // Stage 7 + { + const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[8], step3[15]); + const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[8], step3[15]); + const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[9], step3[14]); + const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[9], step3[14]); + const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]); + const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]); + const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]); + const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]); + const __m256i out_02_2 = + _mm256_madd_epi16(out_02_0, k__cospi_p30_p02); + const __m256i out_02_3 = + _mm256_madd_epi16(out_02_1, k__cospi_p30_p02); + const __m256i out_18_2 = + _mm256_madd_epi16(out_18_0, k__cospi_p14_p18); + const __m256i out_18_3 = + _mm256_madd_epi16(out_18_1, k__cospi_p14_p18); + const __m256i out_10_2 = + _mm256_madd_epi16(out_10_0, k__cospi_p22_p10); + const __m256i out_10_3 = + _mm256_madd_epi16(out_10_1, k__cospi_p22_p10); + const __m256i out_26_2 = + _mm256_madd_epi16(out_26_0, k__cospi_p06_p26); + const __m256i out_26_3 = + _mm256_madd_epi16(out_26_1, k__cospi_p06_p26); + const __m256i out_06_2 = + _mm256_madd_epi16(out_26_0, k__cospi_m26_p06); + const __m256i out_06_3 = + _mm256_madd_epi16(out_26_1, k__cospi_m26_p06); + const __m256i out_22_2 = + _mm256_madd_epi16(out_10_0, k__cospi_m10_p22); + const __m256i out_22_3 = + _mm256_madd_epi16(out_10_1, k__cospi_m10_p22); + const __m256i out_14_2 = + _mm256_madd_epi16(out_18_0, k__cospi_m18_p14); + const __m256i out_14_3 = + _mm256_madd_epi16(out_18_1, k__cospi_m18_p14); + const __m256i out_30_2 = + _mm256_madd_epi16(out_02_0, k__cospi_m02_p30); + const __m256i out_30_3 = + _mm256_madd_epi16(out_02_1, 
k__cospi_m02_p30); + // dct_const_round_shift + const __m256i out_02_4 = + _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); + const __m256i out_02_5 = + _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); + const __m256i out_18_4 = + _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); + const __m256i out_18_5 = + _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); + const __m256i out_10_4 = + _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); + const __m256i out_10_5 = + _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); + const __m256i out_26_4 = + _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); + const __m256i out_26_5 = + _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); + const __m256i out_06_4 = + _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); + const __m256i out_06_5 = + _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); + const __m256i out_22_4 = + _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); + const __m256i out_22_5 = + _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); + const __m256i out_14_4 = + _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); + const __m256i out_14_5 = + _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); + const __m256i out_30_4 = + _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); + const __m256i out_30_5 = + _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); + const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS); + const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS); + const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS); + const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS); + const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS); + const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS); + const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS); + const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS); + const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS); + const __m256i out_06_7 
= _mm256_srai_epi32(out_06_5, DCT_CONST_BITS); + const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS); + const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS); + const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS); + const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS); + const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS); + const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS); + // Combine + out[2] = _mm256_packs_epi32(out_02_6, out_02_7); + out[18] = _mm256_packs_epi32(out_18_6, out_18_7); + out[10] = _mm256_packs_epi32(out_10_6, out_10_7); + out[26] = _mm256_packs_epi32(out_26_6, out_26_7); + out[6] = _mm256_packs_epi32(out_06_6, out_06_7); + out[22] = _mm256_packs_epi32(out_22_6, out_22_7); + out[14] = _mm256_packs_epi32(out_14_6, out_14_7); + out[30] = _mm256_packs_epi32(out_30_6, out_30_7); + } + { + step1[16] = _mm256_add_epi16(step3[17], step2[16]); + step1[17] = _mm256_sub_epi16(step2[16], step3[17]); + step1[18] = _mm256_sub_epi16(step2[19], step3[18]); + step1[19] = _mm256_add_epi16(step3[18], step2[19]); + step1[20] = _mm256_add_epi16(step3[21], step2[20]); + step1[21] = _mm256_sub_epi16(step2[20], step3[21]); + step1[22] = _mm256_sub_epi16(step2[23], step3[22]); + step1[23] = _mm256_add_epi16(step3[22], step2[23]); + step1[24] = _mm256_add_epi16(step3[25], step2[24]); + step1[25] = _mm256_sub_epi16(step2[24], step3[25]); + step1[26] = _mm256_sub_epi16(step2[27], step3[26]); + step1[27] = _mm256_add_epi16(step3[26], step2[27]); + step1[28] = _mm256_add_epi16(step3[29], step2[28]); + step1[29] = _mm256_sub_epi16(step2[28], step3[29]); + step1[30] = _mm256_sub_epi16(step2[31], step3[30]); + step1[31] = _mm256_add_epi16(step3[30], step2[31]); + } + // Final stage --- outputs indices are bit-reversed. 
+ { + const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]); + const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]); + const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]); + const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]); + const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]); + const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]); + const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]); + const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]); + const __m256i out_01_2 = + _mm256_madd_epi16(out_01_0, k__cospi_p31_p01); + const __m256i out_01_3 = + _mm256_madd_epi16(out_01_1, k__cospi_p31_p01); + const __m256i out_17_2 = + _mm256_madd_epi16(out_17_0, k__cospi_p15_p17); + const __m256i out_17_3 = + _mm256_madd_epi16(out_17_1, k__cospi_p15_p17); + const __m256i out_09_2 = + _mm256_madd_epi16(out_09_0, k__cospi_p23_p09); + const __m256i out_09_3 = + _mm256_madd_epi16(out_09_1, k__cospi_p23_p09); + const __m256i out_25_2 = + _mm256_madd_epi16(out_25_0, k__cospi_p07_p25); + const __m256i out_25_3 = + _mm256_madd_epi16(out_25_1, k__cospi_p07_p25); + const __m256i out_07_2 = + _mm256_madd_epi16(out_25_0, k__cospi_m25_p07); + const __m256i out_07_3 = + _mm256_madd_epi16(out_25_1, k__cospi_m25_p07); + const __m256i out_23_2 = + _mm256_madd_epi16(out_09_0, k__cospi_m09_p23); + const __m256i out_23_3 = + _mm256_madd_epi16(out_09_1, k__cospi_m09_p23); + const __m256i out_15_2 = + _mm256_madd_epi16(out_17_0, k__cospi_m17_p15); + const __m256i out_15_3 = + _mm256_madd_epi16(out_17_1, k__cospi_m17_p15); + const __m256i out_31_2 = + _mm256_madd_epi16(out_01_0, k__cospi_m01_p31); + const __m256i out_31_3 = + _mm256_madd_epi16(out_01_1, k__cospi_m01_p31); + // dct_const_round_shift + const __m256i out_01_4 = + _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); + const __m256i out_01_5 = + _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); + const __m256i 
out_17_4 = + _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); + const __m256i out_17_5 = + _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); + const __m256i out_09_4 = + _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); + const __m256i out_09_5 = + _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); + const __m256i out_25_4 = + _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); + const __m256i out_25_5 = + _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); + const __m256i out_07_4 = + _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); + const __m256i out_07_5 = + _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); + const __m256i out_23_4 = + _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); + const __m256i out_23_5 = + _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); + const __m256i out_15_4 = + _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); + const __m256i out_15_5 = + _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); + const __m256i out_31_4 = + _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); + const __m256i out_31_5 = + _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); + const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS); + const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS); + const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS); + const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS); + const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS); + const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS); + const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS); + const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS); + const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS); + const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS); + const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS); + const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS); + const __m256i out_15_6 = 
_mm256_srai_epi32(out_15_4, DCT_CONST_BITS); + const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS); + const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS); + const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS); + // Combine + out[1] = _mm256_packs_epi32(out_01_6, out_01_7); + out[17] = _mm256_packs_epi32(out_17_6, out_17_7); + out[9] = _mm256_packs_epi32(out_09_6, out_09_7); + out[25] = _mm256_packs_epi32(out_25_6, out_25_7); + out[7] = _mm256_packs_epi32(out_07_6, out_07_7); + out[23] = _mm256_packs_epi32(out_23_6, out_23_7); + out[15] = _mm256_packs_epi32(out_15_6, out_15_7); + out[31] = _mm256_packs_epi32(out_31_6, out_31_7); + } + { + const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]); + const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]); + const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]); + const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]); + const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]); + const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]); + const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]); + const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]); + const __m256i out_05_2 = + _mm256_madd_epi16(out_05_0, k__cospi_p27_p05); + const __m256i out_05_3 = + _mm256_madd_epi16(out_05_1, k__cospi_p27_p05); + const __m256i out_21_2 = + _mm256_madd_epi16(out_21_0, k__cospi_p11_p21); + const __m256i out_21_3 = + _mm256_madd_epi16(out_21_1, k__cospi_p11_p21); + const __m256i out_13_2 = + _mm256_madd_epi16(out_13_0, k__cospi_p19_p13); + const __m256i out_13_3 = + _mm256_madd_epi16(out_13_1, k__cospi_p19_p13); + const __m256i out_29_2 = + _mm256_madd_epi16(out_29_0, k__cospi_p03_p29); + const __m256i out_29_3 = + _mm256_madd_epi16(out_29_1, k__cospi_p03_p29); + const __m256i out_03_2 = + _mm256_madd_epi16(out_29_0, k__cospi_m29_p03); + const __m256i out_03_3 = + 
_mm256_madd_epi16(out_29_1, k__cospi_m29_p03); + const __m256i out_19_2 = + _mm256_madd_epi16(out_13_0, k__cospi_m13_p19); + const __m256i out_19_3 = + _mm256_madd_epi16(out_13_1, k__cospi_m13_p19); + const __m256i out_11_2 = + _mm256_madd_epi16(out_21_0, k__cospi_m21_p11); + const __m256i out_11_3 = + _mm256_madd_epi16(out_21_1, k__cospi_m21_p11); + const __m256i out_27_2 = + _mm256_madd_epi16(out_05_0, k__cospi_m05_p27); + const __m256i out_27_3 = + _mm256_madd_epi16(out_05_1, k__cospi_m05_p27); + // dct_const_round_shift + const __m256i out_05_4 = + _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); + const __m256i out_05_5 = + _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); + const __m256i out_21_4 = + _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); + const __m256i out_21_5 = + _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); + const __m256i out_13_4 = + _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); + const __m256i out_13_5 = + _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); + const __m256i out_29_4 = + _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); + const __m256i out_29_5 = + _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); + const __m256i out_03_4 = + _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); + const __m256i out_03_5 = + _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); + const __m256i out_19_4 = + _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); + const __m256i out_19_5 = + _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); + const __m256i out_11_4 = + _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); + const __m256i out_11_5 = + _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); + const __m256i out_27_4 = + _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); + const __m256i out_27_5 = + _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); + const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS); + const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS); + const __m256i out_21_6 = 
_mm256_srai_epi32(out_21_4, DCT_CONST_BITS); + const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS); + const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS); + const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS); + const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS); + const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS); + const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS); + const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS); + const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS); + const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS); + const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS); + const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS); + const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS); + const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS); + // Combine + out[5] = _mm256_packs_epi32(out_05_6, out_05_7); + out[21] = _mm256_packs_epi32(out_21_6, out_21_7); + out[13] = _mm256_packs_epi32(out_13_6, out_13_7); + out[29] = _mm256_packs_epi32(out_29_6, out_29_7); + out[3] = _mm256_packs_epi32(out_03_6, out_03_7); + out[19] = _mm256_packs_epi32(out_19_6, out_19_7); + out[11] = _mm256_packs_epi32(out_11_6, out_11_7); + out[27] = _mm256_packs_epi32(out_27_6, out_27_7); + } +#if FDCT32x32_HIGH_PRECISION + } else { + __m256i lstep1[64], lstep2[64], lstep3[64]; + __m256i u[32], v[32], sign[16]; + const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1); + // start using 32-bit operations + // stage 3 + { + // expanding to 32-bit length priori to addition operations + lstep2[0] = _mm256_unpacklo_epi16(step2[0], kZero); + lstep2[1] = _mm256_unpackhi_epi16(step2[0], kZero); + lstep2[2] = _mm256_unpacklo_epi16(step2[1], kZero); + lstep2[3] = _mm256_unpackhi_epi16(step2[1], kZero); + lstep2[4] = _mm256_unpacklo_epi16(step2[2], kZero); + lstep2[5] = 
_mm256_unpackhi_epi16(step2[2], kZero); + lstep2[6] = _mm256_unpacklo_epi16(step2[3], kZero); + lstep2[7] = _mm256_unpackhi_epi16(step2[3], kZero); + lstep2[8] = _mm256_unpacklo_epi16(step2[4], kZero); + lstep2[9] = _mm256_unpackhi_epi16(step2[4], kZero); + lstep2[10] = _mm256_unpacklo_epi16(step2[5], kZero); + lstep2[11] = _mm256_unpackhi_epi16(step2[5], kZero); + lstep2[12] = _mm256_unpacklo_epi16(step2[6], kZero); + lstep2[13] = _mm256_unpackhi_epi16(step2[6], kZero); + lstep2[14] = _mm256_unpacklo_epi16(step2[7], kZero); + lstep2[15] = _mm256_unpackhi_epi16(step2[7], kZero); + lstep2[0] = _mm256_madd_epi16(lstep2[0], kOne); + lstep2[1] = _mm256_madd_epi16(lstep2[1], kOne); + lstep2[2] = _mm256_madd_epi16(lstep2[2], kOne); + lstep2[3] = _mm256_madd_epi16(lstep2[3], kOne); + lstep2[4] = _mm256_madd_epi16(lstep2[4], kOne); + lstep2[5] = _mm256_madd_epi16(lstep2[5], kOne); + lstep2[6] = _mm256_madd_epi16(lstep2[6], kOne); + lstep2[7] = _mm256_madd_epi16(lstep2[7], kOne); + lstep2[8] = _mm256_madd_epi16(lstep2[8], kOne); + lstep2[9] = _mm256_madd_epi16(lstep2[9], kOne); + lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne); + lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne); + lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne); + lstep2[13] = _mm256_madd_epi16(lstep2[13], kOne); + lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne); + lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne); + + lstep3[0] = _mm256_add_epi32(lstep2[14], lstep2[0]); + lstep3[1] = _mm256_add_epi32(lstep2[15], lstep2[1]); + lstep3[2] = _mm256_add_epi32(lstep2[12], lstep2[2]); + lstep3[3] = _mm256_add_epi32(lstep2[13], lstep2[3]); + lstep3[4] = _mm256_add_epi32(lstep2[10], lstep2[4]); + lstep3[5] = _mm256_add_epi32(lstep2[11], lstep2[5]); + lstep3[6] = _mm256_add_epi32(lstep2[8], lstep2[6]); + lstep3[7] = _mm256_add_epi32(lstep2[9], lstep2[7]); + lstep3[8] = _mm256_sub_epi32(lstep2[6], lstep2[8]); + lstep3[9] = _mm256_sub_epi32(lstep2[7], lstep2[9]); + lstep3[10] = _mm256_sub_epi32(lstep2[4], 
lstep2[10]); + lstep3[11] = _mm256_sub_epi32(lstep2[5], lstep2[11]); + lstep3[12] = _mm256_sub_epi32(lstep2[2], lstep2[12]); + lstep3[13] = _mm256_sub_epi32(lstep2[3], lstep2[13]); + lstep3[14] = _mm256_sub_epi32(lstep2[0], lstep2[14]); + lstep3[15] = _mm256_sub_epi32(lstep2[1], lstep2[15]); + } + { + const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]); + const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]); + const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]); + const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]); + const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s3_10_4 = + _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m256i s3_10_5 = + _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m256i s3_11_4 = + _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m256i s3_11_5 = + _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m256i s3_12_4 = + _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m256i s3_12_5 = + _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m256i s3_13_4 = + _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m256i s3_13_5 = + _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS); + lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS); + lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS); + 
lstep3[23] = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS); + lstep3[24] = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS); + lstep3[25] = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS); + lstep3[26] = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS); + lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS); + } + { + lstep2[40] = _mm256_unpacklo_epi16(step2[20], kZero); + lstep2[41] = _mm256_unpackhi_epi16(step2[20], kZero); + lstep2[42] = _mm256_unpacklo_epi16(step2[21], kZero); + lstep2[43] = _mm256_unpackhi_epi16(step2[21], kZero); + lstep2[44] = _mm256_unpacklo_epi16(step2[22], kZero); + lstep2[45] = _mm256_unpackhi_epi16(step2[22], kZero); + lstep2[46] = _mm256_unpacklo_epi16(step2[23], kZero); + lstep2[47] = _mm256_unpackhi_epi16(step2[23], kZero); + lstep2[48] = _mm256_unpacklo_epi16(step2[24], kZero); + lstep2[49] = _mm256_unpackhi_epi16(step2[24], kZero); + lstep2[50] = _mm256_unpacklo_epi16(step2[25], kZero); + lstep2[51] = _mm256_unpackhi_epi16(step2[25], kZero); + lstep2[52] = _mm256_unpacklo_epi16(step2[26], kZero); + lstep2[53] = _mm256_unpackhi_epi16(step2[26], kZero); + lstep2[54] = _mm256_unpacklo_epi16(step2[27], kZero); + lstep2[55] = _mm256_unpackhi_epi16(step2[27], kZero); + lstep2[40] = _mm256_madd_epi16(lstep2[40], kOne); + lstep2[41] = _mm256_madd_epi16(lstep2[41], kOne); + lstep2[42] = _mm256_madd_epi16(lstep2[42], kOne); + lstep2[43] = _mm256_madd_epi16(lstep2[43], kOne); + lstep2[44] = _mm256_madd_epi16(lstep2[44], kOne); + lstep2[45] = _mm256_madd_epi16(lstep2[45], kOne); + lstep2[46] = _mm256_madd_epi16(lstep2[46], kOne); + lstep2[47] = _mm256_madd_epi16(lstep2[47], kOne); + lstep2[48] = _mm256_madd_epi16(lstep2[48], kOne); + lstep2[49] = _mm256_madd_epi16(lstep2[49], kOne); + lstep2[50] = _mm256_madd_epi16(lstep2[50], kOne); + lstep2[51] = _mm256_madd_epi16(lstep2[51], kOne); + lstep2[52] = _mm256_madd_epi16(lstep2[52], kOne); + lstep2[53] = _mm256_madd_epi16(lstep2[53], kOne); + lstep2[54] = _mm256_madd_epi16(lstep2[54], kOne); + lstep2[55] = 
_mm256_madd_epi16(lstep2[55], kOne); + + lstep1[32] = _mm256_unpacklo_epi16(step1[16], kZero); + lstep1[33] = _mm256_unpackhi_epi16(step1[16], kZero); + lstep1[34] = _mm256_unpacklo_epi16(step1[17], kZero); + lstep1[35] = _mm256_unpackhi_epi16(step1[17], kZero); + lstep1[36] = _mm256_unpacklo_epi16(step1[18], kZero); + lstep1[37] = _mm256_unpackhi_epi16(step1[18], kZero); + lstep1[38] = _mm256_unpacklo_epi16(step1[19], kZero); + lstep1[39] = _mm256_unpackhi_epi16(step1[19], kZero); + lstep1[56] = _mm256_unpacklo_epi16(step1[28], kZero); + lstep1[57] = _mm256_unpackhi_epi16(step1[28], kZero); + lstep1[58] = _mm256_unpacklo_epi16(step1[29], kZero); + lstep1[59] = _mm256_unpackhi_epi16(step1[29], kZero); + lstep1[60] = _mm256_unpacklo_epi16(step1[30], kZero); + lstep1[61] = _mm256_unpackhi_epi16(step1[30], kZero); + lstep1[62] = _mm256_unpacklo_epi16(step1[31], kZero); + lstep1[63] = _mm256_unpackhi_epi16(step1[31], kZero); + lstep1[32] = _mm256_madd_epi16(lstep1[32], kOne); + lstep1[33] = _mm256_madd_epi16(lstep1[33], kOne); + lstep1[34] = _mm256_madd_epi16(lstep1[34], kOne); + lstep1[35] = _mm256_madd_epi16(lstep1[35], kOne); + lstep1[36] = _mm256_madd_epi16(lstep1[36], kOne); + lstep1[37] = _mm256_madd_epi16(lstep1[37], kOne); + lstep1[38] = _mm256_madd_epi16(lstep1[38], kOne); + lstep1[39] = _mm256_madd_epi16(lstep1[39], kOne); + lstep1[56] = _mm256_madd_epi16(lstep1[56], kOne); + lstep1[57] = _mm256_madd_epi16(lstep1[57], kOne); + lstep1[58] = _mm256_madd_epi16(lstep1[58], kOne); + lstep1[59] = _mm256_madd_epi16(lstep1[59], kOne); + lstep1[60] = _mm256_madd_epi16(lstep1[60], kOne); + lstep1[61] = _mm256_madd_epi16(lstep1[61], kOne); + lstep1[62] = _mm256_madd_epi16(lstep1[62], kOne); + lstep1[63] = _mm256_madd_epi16(lstep1[63], kOne); + + lstep3[32] = _mm256_add_epi32(lstep2[46], lstep1[32]); + lstep3[33] = _mm256_add_epi32(lstep2[47], lstep1[33]); + + lstep3[34] = _mm256_add_epi32(lstep2[44], lstep1[34]); + lstep3[35] = _mm256_add_epi32(lstep2[45], lstep1[35]); 
+ lstep3[36] = _mm256_add_epi32(lstep2[42], lstep1[36]); + lstep3[37] = _mm256_add_epi32(lstep2[43], lstep1[37]); + lstep3[38] = _mm256_add_epi32(lstep2[40], lstep1[38]); + lstep3[39] = _mm256_add_epi32(lstep2[41], lstep1[39]); + lstep3[40] = _mm256_sub_epi32(lstep1[38], lstep2[40]); + lstep3[41] = _mm256_sub_epi32(lstep1[39], lstep2[41]); + lstep3[42] = _mm256_sub_epi32(lstep1[36], lstep2[42]); + lstep3[43] = _mm256_sub_epi32(lstep1[37], lstep2[43]); + lstep3[44] = _mm256_sub_epi32(lstep1[34], lstep2[44]); + lstep3[45] = _mm256_sub_epi32(lstep1[35], lstep2[45]); + lstep3[46] = _mm256_sub_epi32(lstep1[32], lstep2[46]); + lstep3[47] = _mm256_sub_epi32(lstep1[33], lstep2[47]); + lstep3[48] = _mm256_sub_epi32(lstep1[62], lstep2[48]); + lstep3[49] = _mm256_sub_epi32(lstep1[63], lstep2[49]); + lstep3[50] = _mm256_sub_epi32(lstep1[60], lstep2[50]); + lstep3[51] = _mm256_sub_epi32(lstep1[61], lstep2[51]); + lstep3[52] = _mm256_sub_epi32(lstep1[58], lstep2[52]); + lstep3[53] = _mm256_sub_epi32(lstep1[59], lstep2[53]); + lstep3[54] = _mm256_sub_epi32(lstep1[56], lstep2[54]); + lstep3[55] = _mm256_sub_epi32(lstep1[57], lstep2[55]); + lstep3[56] = _mm256_add_epi32(lstep2[54], lstep1[56]); + lstep3[57] = _mm256_add_epi32(lstep2[55], lstep1[57]); + lstep3[58] = _mm256_add_epi32(lstep2[52], lstep1[58]); + lstep3[59] = _mm256_add_epi32(lstep2[53], lstep1[59]); + lstep3[60] = _mm256_add_epi32(lstep2[50], lstep1[60]); + lstep3[61] = _mm256_add_epi32(lstep2[51], lstep1[61]); + lstep3[62] = _mm256_add_epi32(lstep2[48], lstep1[62]); + lstep3[63] = _mm256_add_epi32(lstep2[49], lstep1[63]); + } + + // stage 4 + { + // expanding to 32-bit length priori to addition operations + lstep2[16] = _mm256_unpacklo_epi16(step2[8], kZero); + lstep2[17] = _mm256_unpackhi_epi16(step2[8], kZero); + lstep2[18] = _mm256_unpacklo_epi16(step2[9], kZero); + lstep2[19] = _mm256_unpackhi_epi16(step2[9], kZero); + lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero); + lstep2[29] = 
_mm256_unpackhi_epi16(step2[14], kZero); + lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero); + lstep2[31] = _mm256_unpackhi_epi16(step2[15], kZero); + lstep2[16] = _mm256_madd_epi16(lstep2[16], kOne); + lstep2[17] = _mm256_madd_epi16(lstep2[17], kOne); + lstep2[18] = _mm256_madd_epi16(lstep2[18], kOne); + lstep2[19] = _mm256_madd_epi16(lstep2[19], kOne); + lstep2[28] = _mm256_madd_epi16(lstep2[28], kOne); + lstep2[29] = _mm256_madd_epi16(lstep2[29], kOne); + lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne); + lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne); + + lstep1[0] = _mm256_add_epi32(lstep3[6], lstep3[0]); + lstep1[1] = _mm256_add_epi32(lstep3[7], lstep3[1]); + lstep1[2] = _mm256_add_epi32(lstep3[4], lstep3[2]); + lstep1[3] = _mm256_add_epi32(lstep3[5], lstep3[3]); + lstep1[4] = _mm256_sub_epi32(lstep3[2], lstep3[4]); + lstep1[5] = _mm256_sub_epi32(lstep3[3], lstep3[5]); + lstep1[6] = _mm256_sub_epi32(lstep3[0], lstep3[6]); + lstep1[7] = _mm256_sub_epi32(lstep3[1], lstep3[7]); + lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]); + lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]); + lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]); + lstep1[19] = _mm256_add_epi32(lstep3[21], lstep2[19]); + lstep1[20] = _mm256_sub_epi32(lstep2[18], lstep3[20]); + lstep1[21] = _mm256_sub_epi32(lstep2[19], lstep3[21]); + lstep1[22] = _mm256_sub_epi32(lstep2[16], lstep3[22]); + lstep1[23] = _mm256_sub_epi32(lstep2[17], lstep3[23]); + lstep1[24] = _mm256_sub_epi32(lstep2[30], lstep3[24]); + lstep1[25] = _mm256_sub_epi32(lstep2[31], lstep3[25]); + lstep1[26] = _mm256_sub_epi32(lstep2[28], lstep3[26]); + lstep1[27] = _mm256_sub_epi32(lstep2[29], lstep3[27]); + lstep1[28] = _mm256_add_epi32(lstep3[26], lstep2[28]); + lstep1[29] = _mm256_add_epi32(lstep3[27], lstep2[29]); + lstep1[30] = _mm256_add_epi32(lstep3[24], lstep2[30]); + lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]); + } + { + // to be continued... 
+ // + const __m256i k32_p16_p16 = + pair256_set_epi32(cospi_16_64, cospi_16_64); + const __m256i k32_p16_m16 = + pair256_set_epi32(cospi_16_64, -cospi_16_64); + + u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]); + u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]); + u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]); + u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]); + + // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide + // instruction latency. + v[0] = k_madd_epi32_avx2(u[0], k32_p16_m16); + v[1] = k_madd_epi32_avx2(u[1], k32_p16_m16); + v[2] = k_madd_epi32_avx2(u[2], k32_p16_m16); + v[3] = k_madd_epi32_avx2(u[3], k32_p16_m16); + v[4] = k_madd_epi32_avx2(u[0], k32_p16_p16); + v[5] = k_madd_epi32_avx2(u[1], k32_p16_p16); + v[6] = k_madd_epi32_avx2(u[2], k32_p16_p16); + v[7] = k_madd_epi32_avx2(u[3], k32_p16_p16); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + + lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + } + { + const __m256i k32_m08_p24 = + pair256_set_epi32(-cospi_8_64, cospi_24_64); + const __m256i k32_m24_m08 = + pair256_set_epi32(-cospi_24_64, -cospi_8_64); + const __m256i k32_p24_p08 = + pair256_set_epi32(cospi_24_64, cospi_8_64); + + u[0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]); + u[1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]); + u[2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]); + u[3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]); + u[4] = _mm256_unpacklo_epi32(lstep3[38], 
lstep3[56]); + u[5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]); + u[6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]); + u[7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]); + u[8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]); + u[9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]); + u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]); + u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]); + u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]); + u[13] = _mm256_unpackhi_epi32(lstep3[42], lstep3[52]); + u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]); + u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]); + + v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24); + v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24); + v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24); + v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24); + v[4] = k_madd_epi32_avx2(u[4], k32_m08_p24); + v[5] = k_madd_epi32_avx2(u[5], k32_m08_p24); + v[6] = k_madd_epi32_avx2(u[6], k32_m08_p24); + v[7] = k_madd_epi32_avx2(u[7], k32_m08_p24); + v[8] = k_madd_epi32_avx2(u[8], k32_m24_m08); + v[9] = k_madd_epi32_avx2(u[9], k32_m24_m08); + v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08); + v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08); + v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08); + v[13] = k_madd_epi32_avx2(u[13], k32_m24_m08); + v[14] = k_madd_epi32_avx2(u[14], k32_m24_m08); + v[15] = k_madd_epi32_avx2(u[15], k32_m24_m08); + v[16] = k_madd_epi32_avx2(u[12], k32_m08_p24); + v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24); + v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24); + v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24); + v[20] = k_madd_epi32_avx2(u[8], k32_m08_p24); + v[21] = k_madd_epi32_avx2(u[9], k32_m08_p24); + v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24); + v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24); + v[24] = k_madd_epi32_avx2(u[4], k32_p24_p08); + v[25] = k_madd_epi32_avx2(u[5], k32_p24_p08); + v[26] = k_madd_epi32_avx2(u[6], k32_p24_p08); + v[27] = k_madd_epi32_avx2(u[7], k32_p24_p08); + v[28] = 
k_madd_epi32_avx2(u[0], k32_p24_p08); + v[29] = k_madd_epi32_avx2(u[1], k32_p24_p08); + v[30] = k_madd_epi32_avx2(u[2], k32_p24_p08); + v[31] = k_madd_epi32_avx2(u[3], k32_p24_p08); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + lstep1[36] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[37] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[38] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[39] = _mm256_srai_epi32(v[3], 
DCT_CONST_BITS); + lstep1[40] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + lstep1[41] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + lstep1[42] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + lstep1[43] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + lstep1[52] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + lstep1[53] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); + lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); + lstep1[57] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + lstep1[58] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + lstep1[59] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + } + // stage 5 + { + lstep2[8] = _mm256_add_epi32(lstep1[10], lstep3[8]); + lstep2[9] = _mm256_add_epi32(lstep1[11], lstep3[9]); + lstep2[10] = _mm256_sub_epi32(lstep3[8], lstep1[10]); + lstep2[11] = _mm256_sub_epi32(lstep3[9], lstep1[11]); + lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]); + lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]); + lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]); + lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]); + } + { + const __m256i k32_p16_p16 = + pair256_set_epi32(cospi_16_64, cospi_16_64); + const __m256i k32_p16_m16 = + pair256_set_epi32(cospi_16_64, -cospi_16_64); + const __m256i k32_p24_p08 = + pair256_set_epi32(cospi_24_64, cospi_8_64); + const __m256i k32_m08_p24 = + pair256_set_epi32(-cospi_8_64, cospi_24_64); + + u[0] = _mm256_unpacklo_epi32(lstep1[0], lstep1[2]); + u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]); + u[2] = _mm256_unpacklo_epi32(lstep1[1], lstep1[3]); + u[3] = _mm256_unpackhi_epi32(lstep1[1], lstep1[3]); + u[4] = _mm256_unpacklo_epi32(lstep1[4], lstep1[6]); + u[5] = _mm256_unpackhi_epi32(lstep1[4], lstep1[6]); + u[6] = _mm256_unpacklo_epi32(lstep1[5], lstep1[7]); + u[7] = _mm256_unpackhi_epi32(lstep1[5], lstep1[7]); + + // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide + // 
instruction latency. + v[0] = k_madd_epi32_avx2(u[0], k32_p16_p16); + v[1] = k_madd_epi32_avx2(u[1], k32_p16_p16); + v[2] = k_madd_epi32_avx2(u[2], k32_p16_p16); + v[3] = k_madd_epi32_avx2(u[3], k32_p16_p16); + v[4] = k_madd_epi32_avx2(u[0], k32_p16_m16); + v[5] = k_madd_epi32_avx2(u[1], k32_p16_m16); + v[6] = k_madd_epi32_avx2(u[2], k32_p16_m16); + v[7] = k_madd_epi32_avx2(u[3], k32_p16_m16); + v[8] = k_madd_epi32_avx2(u[4], k32_p24_p08); + v[9] = k_madd_epi32_avx2(u[5], k32_p24_p08); + v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08); + v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08); + v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24); + v[13] = k_madd_epi32_avx2(u[5], k32_m08_p24); + v[14] = k_madd_epi32_avx2(u[6], k32_m08_p24); + v[15] = k_madd_epi32_avx2(u[7], k32_m08_p24); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + + sign[0] = 
_mm256_cmpgt_epi32(kZero, u[0]); + sign[1] = _mm256_cmpgt_epi32(kZero, u[1]); + sign[2] = _mm256_cmpgt_epi32(kZero, u[2]); + sign[3] = _mm256_cmpgt_epi32(kZero, u[3]); + sign[4] = _mm256_cmpgt_epi32(kZero, u[4]); + sign[5] = _mm256_cmpgt_epi32(kZero, u[5]); + sign[6] = _mm256_cmpgt_epi32(kZero, u[6]); + sign[7] = _mm256_cmpgt_epi32(kZero, u[7]); + + u[0] = _mm256_sub_epi32(u[0], sign[0]); + u[1] = _mm256_sub_epi32(u[1], sign[1]); + u[2] = _mm256_sub_epi32(u[2], sign[2]); + u[3] = _mm256_sub_epi32(u[3], sign[3]); + u[4] = _mm256_sub_epi32(u[4], sign[4]); + u[5] = _mm256_sub_epi32(u[5], sign[5]); + u[6] = _mm256_sub_epi32(u[6], sign[6]); + u[7] = _mm256_sub_epi32(u[7], sign[7]); + + u[0] = _mm256_add_epi32(u[0], K32One); + u[1] = _mm256_add_epi32(u[1], K32One); + u[2] = _mm256_add_epi32(u[2], K32One); + u[3] = _mm256_add_epi32(u[3], K32One); + u[4] = _mm256_add_epi32(u[4], K32One); + u[5] = _mm256_add_epi32(u[5], K32One); + u[6] = _mm256_add_epi32(u[6], K32One); + u[7] = _mm256_add_epi32(u[7], K32One); + + u[0] = _mm256_srai_epi32(u[0], 2); + u[1] = _mm256_srai_epi32(u[1], 2); + u[2] = _mm256_srai_epi32(u[2], 2); + u[3] = _mm256_srai_epi32(u[3], 2); + u[4] = _mm256_srai_epi32(u[4], 2); + u[5] = _mm256_srai_epi32(u[5], 2); + u[6] = _mm256_srai_epi32(u[6], 2); + u[7] = _mm256_srai_epi32(u[7], 2); + + // Combine + out[0] = _mm256_packs_epi32(u[0], u[1]); + out[16] = _mm256_packs_epi32(u[2], u[3]); + out[8] = _mm256_packs_epi32(u[4], u[5]); + out[24] = _mm256_packs_epi32(u[6], u[7]); + } + { + const __m256i k32_m08_p24 = + pair256_set_epi32(-cospi_8_64, cospi_24_64); + const __m256i k32_m24_m08 = + pair256_set_epi32(-cospi_24_64, -cospi_8_64); + const __m256i k32_p24_p08 = + pair256_set_epi32(cospi_24_64, cospi_8_64); + + u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]); + u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]); + u[2] = _mm256_unpacklo_epi32(lstep1[19], lstep1[29]); + u[3] = _mm256_unpackhi_epi32(lstep1[19], lstep1[29]); + u[4] = 
_mm256_unpacklo_epi32(lstep1[20], lstep1[26]); + u[5] = _mm256_unpackhi_epi32(lstep1[20], lstep1[26]); + u[6] = _mm256_unpacklo_epi32(lstep1[21], lstep1[27]); + u[7] = _mm256_unpackhi_epi32(lstep1[21], lstep1[27]); + + v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24); + v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24); + v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24); + v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24); + v[4] = k_madd_epi32_avx2(u[4], k32_m24_m08); + v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08); + v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08); + v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08); + v[8] = k_madd_epi32_avx2(u[4], k32_m08_p24); + v[9] = k_madd_epi32_avx2(u[5], k32_m08_p24); + v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24); + v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24); + v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08); + v[13] = k_madd_epi32_avx2(u[1], k32_p24_p08); + v[14] = k_madd_epi32_avx2(u[2], k32_p24_p08); + v[15] = k_madd_epi32_avx2(u[3], k32_p24_p08); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + + u[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + lstep2[18] = _mm256_srai_epi32(u[0], DCT_CONST_BITS); + lstep2[19] = _mm256_srai_epi32(u[1], DCT_CONST_BITS); + lstep2[20] = _mm256_srai_epi32(u[2], DCT_CONST_BITS); + lstep2[21] = _mm256_srai_epi32(u[3], DCT_CONST_BITS); + lstep2[26] 
= _mm256_srai_epi32(u[4], DCT_CONST_BITS); + lstep2[27] = _mm256_srai_epi32(u[5], DCT_CONST_BITS); + lstep2[28] = _mm256_srai_epi32(u[6], DCT_CONST_BITS); + lstep2[29] = _mm256_srai_epi32(u[7], DCT_CONST_BITS); + } + { + lstep2[32] = _mm256_add_epi32(lstep1[38], lstep3[32]); + lstep2[33] = _mm256_add_epi32(lstep1[39], lstep3[33]); + lstep2[34] = _mm256_add_epi32(lstep1[36], lstep3[34]); + lstep2[35] = _mm256_add_epi32(lstep1[37], lstep3[35]); + lstep2[36] = _mm256_sub_epi32(lstep3[34], lstep1[36]); + lstep2[37] = _mm256_sub_epi32(lstep3[35], lstep1[37]); + lstep2[38] = _mm256_sub_epi32(lstep3[32], lstep1[38]); + lstep2[39] = _mm256_sub_epi32(lstep3[33], lstep1[39]); + lstep2[40] = _mm256_sub_epi32(lstep3[46], lstep1[40]); + lstep2[41] = _mm256_sub_epi32(lstep3[47], lstep1[41]); + lstep2[42] = _mm256_sub_epi32(lstep3[44], lstep1[42]); + lstep2[43] = _mm256_sub_epi32(lstep3[45], lstep1[43]); + lstep2[44] = _mm256_add_epi32(lstep1[42], lstep3[44]); + lstep2[45] = _mm256_add_epi32(lstep1[43], lstep3[45]); + lstep2[46] = _mm256_add_epi32(lstep1[40], lstep3[46]); + lstep2[47] = _mm256_add_epi32(lstep1[41], lstep3[47]); + lstep2[48] = _mm256_add_epi32(lstep1[54], lstep3[48]); + lstep2[49] = _mm256_add_epi32(lstep1[55], lstep3[49]); + lstep2[50] = _mm256_add_epi32(lstep1[52], lstep3[50]); + lstep2[51] = _mm256_add_epi32(lstep1[53], lstep3[51]); + lstep2[52] = _mm256_sub_epi32(lstep3[50], lstep1[52]); + lstep2[53] = _mm256_sub_epi32(lstep3[51], lstep1[53]); + lstep2[54] = _mm256_sub_epi32(lstep3[48], lstep1[54]); + lstep2[55] = _mm256_sub_epi32(lstep3[49], lstep1[55]); + lstep2[56] = _mm256_sub_epi32(lstep3[62], lstep1[56]); + lstep2[57] = _mm256_sub_epi32(lstep3[63], lstep1[57]); + lstep2[58] = _mm256_sub_epi32(lstep3[60], lstep1[58]); + lstep2[59] = _mm256_sub_epi32(lstep3[61], lstep1[59]); + lstep2[60] = _mm256_add_epi32(lstep1[58], lstep3[60]); + lstep2[61] = _mm256_add_epi32(lstep1[59], lstep3[61]); + lstep2[62] = _mm256_add_epi32(lstep1[56], lstep3[62]); + lstep2[63] 
= _mm256_add_epi32(lstep1[57], lstep3[63]); + } + // stage 6 + { + const __m256i k32_p28_p04 = + pair256_set_epi32(cospi_28_64, cospi_4_64); + const __m256i k32_p12_p20 = + pair256_set_epi32(cospi_12_64, cospi_20_64); + const __m256i k32_m20_p12 = + pair256_set_epi32(-cospi_20_64, cospi_12_64); + const __m256i k32_m04_p28 = + pair256_set_epi32(-cospi_4_64, cospi_28_64); + + u[0] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]); + u[1] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]); + u[2] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]); + u[3] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]); + u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]); + u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]); + u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]); + u[7] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]); + u[8] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]); + u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]); + u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]); + u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]); + u[12] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]); + u[13] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]); + u[14] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]); + u[15] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]); + + v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04); + v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04); + v[2] = k_madd_epi32_avx2(u[2], k32_p28_p04); + v[3] = k_madd_epi32_avx2(u[3], k32_p28_p04); + v[4] = k_madd_epi32_avx2(u[4], k32_p12_p20); + v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20); + v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20); + v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20); + v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12); + v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12); + v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12); + v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12); + v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28); + v[13] = k_madd_epi32_avx2(u[13], k32_m04_p28); + v[14] = k_madd_epi32_avx2(u[14], k32_m04_p28); + v[15] = 
k_madd_epi32_avx2(u[15], k32_m04_p28); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + + sign[0] = _mm256_cmpgt_epi32(kZero, u[0]); + sign[1] = _mm256_cmpgt_epi32(kZero, u[1]); + sign[2] = _mm256_cmpgt_epi32(kZero, u[2]); + sign[3] = _mm256_cmpgt_epi32(kZero, u[3]); + sign[4] = _mm256_cmpgt_epi32(kZero, u[4]); + sign[5] = _mm256_cmpgt_epi32(kZero, u[5]); + sign[6] = _mm256_cmpgt_epi32(kZero, u[6]); + sign[7] = _mm256_cmpgt_epi32(kZero, u[7]); + + u[0] = _mm256_sub_epi32(u[0], sign[0]); + u[1] = _mm256_sub_epi32(u[1], sign[1]); + u[2] = _mm256_sub_epi32(u[2], sign[2]); + u[3] = _mm256_sub_epi32(u[3], sign[3]); + u[4] = _mm256_sub_epi32(u[4], sign[4]); + u[5] = _mm256_sub_epi32(u[5], sign[5]); + u[6] = _mm256_sub_epi32(u[6], sign[6]); + u[7] = _mm256_sub_epi32(u[7], sign[7]); + + u[0] = _mm256_add_epi32(u[0], K32One); + u[1] = 
_mm256_add_epi32(u[1], K32One); + u[2] = _mm256_add_epi32(u[2], K32One); + u[3] = _mm256_add_epi32(u[3], K32One); + u[4] = _mm256_add_epi32(u[4], K32One); + u[5] = _mm256_add_epi32(u[5], K32One); + u[6] = _mm256_add_epi32(u[6], K32One); + u[7] = _mm256_add_epi32(u[7], K32One); + + u[0] = _mm256_srai_epi32(u[0], 2); + u[1] = _mm256_srai_epi32(u[1], 2); + u[2] = _mm256_srai_epi32(u[2], 2); + u[3] = _mm256_srai_epi32(u[3], 2); + u[4] = _mm256_srai_epi32(u[4], 2); + u[5] = _mm256_srai_epi32(u[5], 2); + u[6] = _mm256_srai_epi32(u[6], 2); + u[7] = _mm256_srai_epi32(u[7], 2); + + out[4] = _mm256_packs_epi32(u[0], u[1]); + out[20] = _mm256_packs_epi32(u[2], u[3]); + out[12] = _mm256_packs_epi32(u[4], u[5]); + out[28] = _mm256_packs_epi32(u[6], u[7]); + } + { + lstep3[16] = _mm256_add_epi32(lstep2[18], lstep1[16]); + lstep3[17] = _mm256_add_epi32(lstep2[19], lstep1[17]); + lstep3[18] = _mm256_sub_epi32(lstep1[16], lstep2[18]); + lstep3[19] = _mm256_sub_epi32(lstep1[17], lstep2[19]); + lstep3[20] = _mm256_sub_epi32(lstep1[22], lstep2[20]); + lstep3[21] = _mm256_sub_epi32(lstep1[23], lstep2[21]); + lstep3[22] = _mm256_add_epi32(lstep2[20], lstep1[22]); + lstep3[23] = _mm256_add_epi32(lstep2[21], lstep1[23]); + lstep3[24] = _mm256_add_epi32(lstep2[26], lstep1[24]); + lstep3[25] = _mm256_add_epi32(lstep2[27], lstep1[25]); + lstep3[26] = _mm256_sub_epi32(lstep1[24], lstep2[26]); + lstep3[27] = _mm256_sub_epi32(lstep1[25], lstep2[27]); + lstep3[28] = _mm256_sub_epi32(lstep1[30], lstep2[28]); + lstep3[29] = _mm256_sub_epi32(lstep1[31], lstep2[29]); + lstep3[30] = _mm256_add_epi32(lstep2[28], lstep1[30]); + lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]); + } + { + const __m256i k32_m04_p28 = + pair256_set_epi32(-cospi_4_64, cospi_28_64); + const __m256i k32_m28_m04 = + pair256_set_epi32(-cospi_28_64, -cospi_4_64); + const __m256i k32_m20_p12 = + pair256_set_epi32(-cospi_20_64, cospi_12_64); + const __m256i k32_m12_m20 = + pair256_set_epi32(-cospi_12_64, -cospi_20_64); + 
const __m256i k32_p12_p20 = + pair256_set_epi32(cospi_12_64, cospi_20_64); + const __m256i k32_p28_p04 = + pair256_set_epi32(cospi_28_64, cospi_4_64); + + u[0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]); + u[1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]); + u[2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]); + u[3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]); + u[4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]); + u[5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]); + u[6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]); + u[7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]); + u[8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]); + u[9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]); + u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]); + u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]); + u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]); + u[13] = _mm256_unpackhi_epi32(lstep2[44], lstep2[50]); + u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]); + u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]); + + v[0] = k_madd_epi32_avx2(u[0], k32_m04_p28); + v[1] = k_madd_epi32_avx2(u[1], k32_m04_p28); + v[2] = k_madd_epi32_avx2(u[2], k32_m04_p28); + v[3] = k_madd_epi32_avx2(u[3], k32_m04_p28); + v[4] = k_madd_epi32_avx2(u[4], k32_m28_m04); + v[5] = k_madd_epi32_avx2(u[5], k32_m28_m04); + v[6] = k_madd_epi32_avx2(u[6], k32_m28_m04); + v[7] = k_madd_epi32_avx2(u[7], k32_m28_m04); + v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12); + v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12); + v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12); + v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12); + v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20); + v[13] = k_madd_epi32_avx2(u[13], k32_m12_m20); + v[14] = k_madd_epi32_avx2(u[14], k32_m12_m20); + v[15] = k_madd_epi32_avx2(u[15], k32_m12_m20); + v[16] = k_madd_epi32_avx2(u[12], k32_m20_p12); + v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12); + v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12); + v[19] = 
k_madd_epi32_avx2(u[15], k32_m20_p12); + v[20] = k_madd_epi32_avx2(u[8], k32_p12_p20); + v[21] = k_madd_epi32_avx2(u[9], k32_p12_p20); + v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20); + v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20); + v[24] = k_madd_epi32_avx2(u[4], k32_m04_p28); + v[25] = k_madd_epi32_avx2(u[5], k32_m04_p28); + v[26] = k_madd_epi32_avx2(u[6], k32_m04_p28); + v[27] = k_madd_epi32_avx2(u[7], k32_m04_p28); + v[28] = k_madd_epi32_avx2(u[0], k32_p28_p04); + v[29] = k_madd_epi32_avx2(u[1], k32_p28_p04); + v[30] = k_madd_epi32_avx2(u[2], k32_p28_p04); + v[31] = k_madd_epi32_avx2(u[3], k32_p28_p04); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = 
_mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + lstep3[34] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + lstep3[35] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + lstep3[36] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + lstep3[37] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + lstep3[42] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + lstep3[43] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + lstep3[44] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + lstep3[45] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + lstep3[50] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + lstep3[51] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); + lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); + lstep3[59] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + lstep3[60] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + lstep3[61] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + } + // stage 7 + { + const __m256i k32_p30_p02 = + pair256_set_epi32(cospi_30_64, cospi_2_64); + const __m256i k32_p14_p18 = + pair256_set_epi32(cospi_14_64, cospi_18_64); + const __m256i k32_p22_p10 = + pair256_set_epi32(cospi_22_64, cospi_10_64); + const __m256i k32_p06_p26 = + pair256_set_epi32(cospi_6_64, cospi_26_64); + const __m256i k32_m26_p06 = + pair256_set_epi32(-cospi_26_64, cospi_6_64); + const __m256i k32_m10_p22 = + pair256_set_epi32(-cospi_10_64, cospi_22_64); + const __m256i k32_m18_p14 = + pair256_set_epi32(-cospi_18_64, cospi_14_64); + const __m256i k32_m02_p30 = + pair256_set_epi32(-cospi_2_64, cospi_30_64); + + u[0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]); + u[1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]); + u[2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]); + u[3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]); + u[4] = 
_mm256_unpacklo_epi32(lstep3[18], lstep3[28]); + u[5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]); + u[6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]); + u[7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]); + u[8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]); + u[9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]); + u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]); + u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]); + u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]); + u[13] = _mm256_unpackhi_epi32(lstep3[22], lstep3[24]); + u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]); + u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]); + + v[0] = k_madd_epi32_avx2(u[0], k32_p30_p02); + v[1] = k_madd_epi32_avx2(u[1], k32_p30_p02); + v[2] = k_madd_epi32_avx2(u[2], k32_p30_p02); + v[3] = k_madd_epi32_avx2(u[3], k32_p30_p02); + v[4] = k_madd_epi32_avx2(u[4], k32_p14_p18); + v[5] = k_madd_epi32_avx2(u[5], k32_p14_p18); + v[6] = k_madd_epi32_avx2(u[6], k32_p14_p18); + v[7] = k_madd_epi32_avx2(u[7], k32_p14_p18); + v[8] = k_madd_epi32_avx2(u[8], k32_p22_p10); + v[9] = k_madd_epi32_avx2(u[9], k32_p22_p10); + v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10); + v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10); + v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26); + v[13] = k_madd_epi32_avx2(u[13], k32_p06_p26); + v[14] = k_madd_epi32_avx2(u[14], k32_p06_p26); + v[15] = k_madd_epi32_avx2(u[15], k32_p06_p26); + v[16] = k_madd_epi32_avx2(u[12], k32_m26_p06); + v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06); + v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06); + v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06); + v[20] = k_madd_epi32_avx2(u[8], k32_m10_p22); + v[21] = k_madd_epi32_avx2(u[9], k32_m10_p22); + v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22); + v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22); + v[24] = k_madd_epi32_avx2(u[4], k32_m18_p14); + v[25] = k_madd_epi32_avx2(u[5], k32_m18_p14); + v[26] = k_madd_epi32_avx2(u[6], k32_m18_p14); + v[27] = 
k_madd_epi32_avx2(u[7], k32_m18_p14); + v[28] = k_madd_epi32_avx2(u[0], k32_m02_p30); + v[29] = k_madd_epi32_avx2(u[1], k32_m02_p30); + v[30] = k_madd_epi32_avx2(u[2], k32_m02_p30); + v[31] = k_madd_epi32_avx2(u[3], k32_m02_p30); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = 
_mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + + v[0] = _mm256_cmpgt_epi32(kZero, u[0]); + v[1] = _mm256_cmpgt_epi32(kZero, u[1]); + v[2] = _mm256_cmpgt_epi32(kZero, u[2]); + v[3] = _mm256_cmpgt_epi32(kZero, u[3]); + v[4] = _mm256_cmpgt_epi32(kZero, u[4]); + v[5] = _mm256_cmpgt_epi32(kZero, u[5]); + v[6] = _mm256_cmpgt_epi32(kZero, u[6]); + v[7] = _mm256_cmpgt_epi32(kZero, u[7]); + v[8] = _mm256_cmpgt_epi32(kZero, u[8]); + v[9] = _mm256_cmpgt_epi32(kZero, u[9]); + v[10] = _mm256_cmpgt_epi32(kZero, u[10]); + v[11] = _mm256_cmpgt_epi32(kZero, u[11]); + v[12] = _mm256_cmpgt_epi32(kZero, u[12]); + v[13] = _mm256_cmpgt_epi32(kZero, u[13]); + v[14] = _mm256_cmpgt_epi32(kZero, u[14]); + v[15] = _mm256_cmpgt_epi32(kZero, u[15]); + + u[0] = _mm256_sub_epi32(u[0], v[0]); + u[1] = _mm256_sub_epi32(u[1], v[1]); + u[2] = _mm256_sub_epi32(u[2], v[2]); + u[3] = _mm256_sub_epi32(u[3], v[3]); + u[4] = _mm256_sub_epi32(u[4], v[4]); + u[5] = _mm256_sub_epi32(u[5], v[5]); + u[6] = _mm256_sub_epi32(u[6], v[6]); + u[7] = _mm256_sub_epi32(u[7], v[7]); + u[8] = _mm256_sub_epi32(u[8], v[8]); + u[9] = _mm256_sub_epi32(u[9], v[9]); + u[10] = _mm256_sub_epi32(u[10], v[10]); + u[11] = _mm256_sub_epi32(u[11], v[11]); + u[12] = _mm256_sub_epi32(u[12], v[12]); + u[13] = _mm256_sub_epi32(u[13], v[13]); + u[14] = _mm256_sub_epi32(u[14], v[14]); + u[15] = _mm256_sub_epi32(u[15], v[15]); + + v[0] = 
_mm256_add_epi32(u[0], K32One); + v[1] = _mm256_add_epi32(u[1], K32One); + v[2] = _mm256_add_epi32(u[2], K32One); + v[3] = _mm256_add_epi32(u[3], K32One); + v[4] = _mm256_add_epi32(u[4], K32One); + v[5] = _mm256_add_epi32(u[5], K32One); + v[6] = _mm256_add_epi32(u[6], K32One); + v[7] = _mm256_add_epi32(u[7], K32One); + v[8] = _mm256_add_epi32(u[8], K32One); + v[9] = _mm256_add_epi32(u[9], K32One); + v[10] = _mm256_add_epi32(u[10], K32One); + v[11] = _mm256_add_epi32(u[11], K32One); + v[12] = _mm256_add_epi32(u[12], K32One); + v[13] = _mm256_add_epi32(u[13], K32One); + v[14] = _mm256_add_epi32(u[14], K32One); + v[15] = _mm256_add_epi32(u[15], K32One); + + u[0] = _mm256_srai_epi32(v[0], 2); + u[1] = _mm256_srai_epi32(v[1], 2); + u[2] = _mm256_srai_epi32(v[2], 2); + u[3] = _mm256_srai_epi32(v[3], 2); + u[4] = _mm256_srai_epi32(v[4], 2); + u[5] = _mm256_srai_epi32(v[5], 2); + u[6] = _mm256_srai_epi32(v[6], 2); + u[7] = _mm256_srai_epi32(v[7], 2); + u[8] = _mm256_srai_epi32(v[8], 2); + u[9] = _mm256_srai_epi32(v[9], 2); + u[10] = _mm256_srai_epi32(v[10], 2); + u[11] = _mm256_srai_epi32(v[11], 2); + u[12] = _mm256_srai_epi32(v[12], 2); + u[13] = _mm256_srai_epi32(v[13], 2); + u[14] = _mm256_srai_epi32(v[14], 2); + u[15] = _mm256_srai_epi32(v[15], 2); + + out[2] = _mm256_packs_epi32(u[0], u[1]); + out[18] = _mm256_packs_epi32(u[2], u[3]); + out[10] = _mm256_packs_epi32(u[4], u[5]); + out[26] = _mm256_packs_epi32(u[6], u[7]); + out[6] = _mm256_packs_epi32(u[8], u[9]); + out[22] = _mm256_packs_epi32(u[10], u[11]); + out[14] = _mm256_packs_epi32(u[12], u[13]); + out[30] = _mm256_packs_epi32(u[14], u[15]); + } + { + lstep1[32] = _mm256_add_epi32(lstep3[34], lstep2[32]); + lstep1[33] = _mm256_add_epi32(lstep3[35], lstep2[33]); + lstep1[34] = _mm256_sub_epi32(lstep2[32], lstep3[34]); + lstep1[35] = _mm256_sub_epi32(lstep2[33], lstep3[35]); + lstep1[36] = _mm256_sub_epi32(lstep2[38], lstep3[36]); + lstep1[37] = _mm256_sub_epi32(lstep2[39], lstep3[37]); + lstep1[38] = 
_mm256_add_epi32(lstep3[36], lstep2[38]); + lstep1[39] = _mm256_add_epi32(lstep3[37], lstep2[39]); + lstep1[40] = _mm256_add_epi32(lstep3[42], lstep2[40]); + lstep1[41] = _mm256_add_epi32(lstep3[43], lstep2[41]); + lstep1[42] = _mm256_sub_epi32(lstep2[40], lstep3[42]); + lstep1[43] = _mm256_sub_epi32(lstep2[41], lstep3[43]); + lstep1[44] = _mm256_sub_epi32(lstep2[46], lstep3[44]); + lstep1[45] = _mm256_sub_epi32(lstep2[47], lstep3[45]); + lstep1[46] = _mm256_add_epi32(lstep3[44], lstep2[46]); + lstep1[47] = _mm256_add_epi32(lstep3[45], lstep2[47]); + lstep1[48] = _mm256_add_epi32(lstep3[50], lstep2[48]); + lstep1[49] = _mm256_add_epi32(lstep3[51], lstep2[49]); + lstep1[50] = _mm256_sub_epi32(lstep2[48], lstep3[50]); + lstep1[51] = _mm256_sub_epi32(lstep2[49], lstep3[51]); + lstep1[52] = _mm256_sub_epi32(lstep2[54], lstep3[52]); + lstep1[53] = _mm256_sub_epi32(lstep2[55], lstep3[53]); + lstep1[54] = _mm256_add_epi32(lstep3[52], lstep2[54]); + lstep1[55] = _mm256_add_epi32(lstep3[53], lstep2[55]); + lstep1[56] = _mm256_add_epi32(lstep3[58], lstep2[56]); + lstep1[57] = _mm256_add_epi32(lstep3[59], lstep2[57]); + lstep1[58] = _mm256_sub_epi32(lstep2[56], lstep3[58]); + lstep1[59] = _mm256_sub_epi32(lstep2[57], lstep3[59]); + lstep1[60] = _mm256_sub_epi32(lstep2[62], lstep3[60]); + lstep1[61] = _mm256_sub_epi32(lstep2[63], lstep3[61]); + lstep1[62] = _mm256_add_epi32(lstep3[60], lstep2[62]); + lstep1[63] = _mm256_add_epi32(lstep3[61], lstep2[63]); + } + // stage 8 + { + const __m256i k32_p31_p01 = + pair256_set_epi32(cospi_31_64, cospi_1_64); + const __m256i k32_p15_p17 = + pair256_set_epi32(cospi_15_64, cospi_17_64); + const __m256i k32_p23_p09 = + pair256_set_epi32(cospi_23_64, cospi_9_64); + const __m256i k32_p07_p25 = + pair256_set_epi32(cospi_7_64, cospi_25_64); + const __m256i k32_m25_p07 = + pair256_set_epi32(-cospi_25_64, cospi_7_64); + const __m256i k32_m09_p23 = + pair256_set_epi32(-cospi_9_64, cospi_23_64); + const __m256i k32_m17_p15 = + 
pair256_set_epi32(-cospi_17_64, cospi_15_64); + const __m256i k32_m01_p31 = + pair256_set_epi32(-cospi_1_64, cospi_31_64); + + u[0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]); + u[1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]); + u[2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]); + u[3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]); + u[4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]); + u[5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]); + u[6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]); + u[7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]); + u[8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]); + u[9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]); + u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]); + u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]); + u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]); + u[13] = _mm256_unpackhi_epi32(lstep1[38], lstep1[56]); + u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]); + u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]); + + v[0] = k_madd_epi32_avx2(u[0], k32_p31_p01); + v[1] = k_madd_epi32_avx2(u[1], k32_p31_p01); + v[2] = k_madd_epi32_avx2(u[2], k32_p31_p01); + v[3] = k_madd_epi32_avx2(u[3], k32_p31_p01); + v[4] = k_madd_epi32_avx2(u[4], k32_p15_p17); + v[5] = k_madd_epi32_avx2(u[5], k32_p15_p17); + v[6] = k_madd_epi32_avx2(u[6], k32_p15_p17); + v[7] = k_madd_epi32_avx2(u[7], k32_p15_p17); + v[8] = k_madd_epi32_avx2(u[8], k32_p23_p09); + v[9] = k_madd_epi32_avx2(u[9], k32_p23_p09); + v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09); + v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09); + v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25); + v[13] = k_madd_epi32_avx2(u[13], k32_p07_p25); + v[14] = k_madd_epi32_avx2(u[14], k32_p07_p25); + v[15] = k_madd_epi32_avx2(u[15], k32_p07_p25); + v[16] = k_madd_epi32_avx2(u[12], k32_m25_p07); + v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07); + v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07); + v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07); + v[20] = 
k_madd_epi32_avx2(u[8], k32_m09_p23); + v[21] = k_madd_epi32_avx2(u[9], k32_m09_p23); + v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23); + v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23); + v[24] = k_madd_epi32_avx2(u[4], k32_m17_p15); + v[25] = k_madd_epi32_avx2(u[5], k32_m17_p15); + v[26] = k_madd_epi32_avx2(u[6], k32_m17_p15); + v[27] = k_madd_epi32_avx2(u[7], k32_m17_p15); + v[28] = k_madd_epi32_avx2(u[0], k32_m01_p31); + v[29] = k_madd_epi32_avx2(u[1], k32_m01_p31); + v[30] = k_madd_epi32_avx2(u[2], k32_m01_p31); + v[31] = k_madd_epi32_avx2(u[3], k32_m01_p31); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = 
_mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + + v[0] = _mm256_cmpgt_epi32(kZero, u[0]); + v[1] = _mm256_cmpgt_epi32(kZero, u[1]); + v[2] = _mm256_cmpgt_epi32(kZero, u[2]); + v[3] = _mm256_cmpgt_epi32(kZero, u[3]); + v[4] = _mm256_cmpgt_epi32(kZero, u[4]); + v[5] = _mm256_cmpgt_epi32(kZero, u[5]); + v[6] = _mm256_cmpgt_epi32(kZero, u[6]); + v[7] = _mm256_cmpgt_epi32(kZero, u[7]); + v[8] = _mm256_cmpgt_epi32(kZero, u[8]); + v[9] = _mm256_cmpgt_epi32(kZero, u[9]); + v[10] = _mm256_cmpgt_epi32(kZero, u[10]); + v[11] = _mm256_cmpgt_epi32(kZero, u[11]); + v[12] = _mm256_cmpgt_epi32(kZero, u[12]); + v[13] = _mm256_cmpgt_epi32(kZero, u[13]); + v[14] = _mm256_cmpgt_epi32(kZero, u[14]); + v[15] = _mm256_cmpgt_epi32(kZero, u[15]); + + u[0] = _mm256_sub_epi32(u[0], v[0]); + u[1] = _mm256_sub_epi32(u[1], v[1]); + u[2] = _mm256_sub_epi32(u[2], v[2]); + u[3] = _mm256_sub_epi32(u[3], v[3]); + u[4] = _mm256_sub_epi32(u[4], v[4]); + u[5] = _mm256_sub_epi32(u[5], v[5]); + u[6] = _mm256_sub_epi32(u[6], v[6]); + u[7] = _mm256_sub_epi32(u[7], v[7]); + u[8] = 
_mm256_sub_epi32(u[8], v[8]); + u[9] = _mm256_sub_epi32(u[9], v[9]); + u[10] = _mm256_sub_epi32(u[10], v[10]); + u[11] = _mm256_sub_epi32(u[11], v[11]); + u[12] = _mm256_sub_epi32(u[12], v[12]); + u[13] = _mm256_sub_epi32(u[13], v[13]); + u[14] = _mm256_sub_epi32(u[14], v[14]); + u[15] = _mm256_sub_epi32(u[15], v[15]); + + v[0] = _mm256_add_epi32(u[0], K32One); + v[1] = _mm256_add_epi32(u[1], K32One); + v[2] = _mm256_add_epi32(u[2], K32One); + v[3] = _mm256_add_epi32(u[3], K32One); + v[4] = _mm256_add_epi32(u[4], K32One); + v[5] = _mm256_add_epi32(u[5], K32One); + v[6] = _mm256_add_epi32(u[6], K32One); + v[7] = _mm256_add_epi32(u[7], K32One); + v[8] = _mm256_add_epi32(u[8], K32One); + v[9] = _mm256_add_epi32(u[9], K32One); + v[10] = _mm256_add_epi32(u[10], K32One); + v[11] = _mm256_add_epi32(u[11], K32One); + v[12] = _mm256_add_epi32(u[12], K32One); + v[13] = _mm256_add_epi32(u[13], K32One); + v[14] = _mm256_add_epi32(u[14], K32One); + v[15] = _mm256_add_epi32(u[15], K32One); + + u[0] = _mm256_srai_epi32(v[0], 2); + u[1] = _mm256_srai_epi32(v[1], 2); + u[2] = _mm256_srai_epi32(v[2], 2); + u[3] = _mm256_srai_epi32(v[3], 2); + u[4] = _mm256_srai_epi32(v[4], 2); + u[5] = _mm256_srai_epi32(v[5], 2); + u[6] = _mm256_srai_epi32(v[6], 2); + u[7] = _mm256_srai_epi32(v[7], 2); + u[8] = _mm256_srai_epi32(v[8], 2); + u[9] = _mm256_srai_epi32(v[9], 2); + u[10] = _mm256_srai_epi32(v[10], 2); + u[11] = _mm256_srai_epi32(v[11], 2); + u[12] = _mm256_srai_epi32(v[12], 2); + u[13] = _mm256_srai_epi32(v[13], 2); + u[14] = _mm256_srai_epi32(v[14], 2); + u[15] = _mm256_srai_epi32(v[15], 2); + + out[1] = _mm256_packs_epi32(u[0], u[1]); + out[17] = _mm256_packs_epi32(u[2], u[3]); + out[9] = _mm256_packs_epi32(u[4], u[5]); + out[25] = _mm256_packs_epi32(u[6], u[7]); + out[7] = _mm256_packs_epi32(u[8], u[9]); + out[23] = _mm256_packs_epi32(u[10], u[11]); + out[15] = _mm256_packs_epi32(u[12], u[13]); + out[31] = _mm256_packs_epi32(u[14], u[15]); + } + { + const __m256i k32_p27_p05 = + 
pair256_set_epi32(cospi_27_64, cospi_5_64); + const __m256i k32_p11_p21 = + pair256_set_epi32(cospi_11_64, cospi_21_64); + const __m256i k32_p19_p13 = + pair256_set_epi32(cospi_19_64, cospi_13_64); + const __m256i k32_p03_p29 = + pair256_set_epi32(cospi_3_64, cospi_29_64); + const __m256i k32_m29_p03 = + pair256_set_epi32(-cospi_29_64, cospi_3_64); + const __m256i k32_m13_p19 = + pair256_set_epi32(-cospi_13_64, cospi_19_64); + const __m256i k32_m21_p11 = + pair256_set_epi32(-cospi_21_64, cospi_11_64); + const __m256i k32_m05_p27 = + pair256_set_epi32(-cospi_5_64, cospi_27_64); + + u[0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]); + u[1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]); + u[2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]); + u[3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]); + u[4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]); + u[5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]); + u[6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]); + u[7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]); + u[8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]); + u[9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]); + u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]); + u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]); + u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]); + u[13] = _mm256_unpackhi_epi32(lstep1[46], lstep1[48]); + u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]); + u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]); + + v[0] = k_madd_epi32_avx2(u[0], k32_p27_p05); + v[1] = k_madd_epi32_avx2(u[1], k32_p27_p05); + v[2] = k_madd_epi32_avx2(u[2], k32_p27_p05); + v[3] = k_madd_epi32_avx2(u[3], k32_p27_p05); + v[4] = k_madd_epi32_avx2(u[4], k32_p11_p21); + v[5] = k_madd_epi32_avx2(u[5], k32_p11_p21); + v[6] = k_madd_epi32_avx2(u[6], k32_p11_p21); + v[7] = k_madd_epi32_avx2(u[7], k32_p11_p21); + v[8] = k_madd_epi32_avx2(u[8], k32_p19_p13); + v[9] = k_madd_epi32_avx2(u[9], k32_p19_p13); + v[10] = k_madd_epi32_avx2(u[10], 
k32_p19_p13); + v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13); + v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29); + v[13] = k_madd_epi32_avx2(u[13], k32_p03_p29); + v[14] = k_madd_epi32_avx2(u[14], k32_p03_p29); + v[15] = k_madd_epi32_avx2(u[15], k32_p03_p29); + v[16] = k_madd_epi32_avx2(u[12], k32_m29_p03); + v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03); + v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03); + v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03); + v[20] = k_madd_epi32_avx2(u[8], k32_m13_p19); + v[21] = k_madd_epi32_avx2(u[9], k32_m13_p19); + v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19); + v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19); + v[24] = k_madd_epi32_avx2(u[4], k32_m21_p11); + v[25] = k_madd_epi32_avx2(u[5], k32_m21_p11); + v[26] = k_madd_epi32_avx2(u[6], k32_m21_p11); + v[27] = k_madd_epi32_avx2(u[7], k32_m21_p11); + v[28] = k_madd_epi32_avx2(u[0], k32_m05_p27); + v[29] = k_madd_epi32_avx2(u[1], k32_m05_p27); + v[30] = k_madd_epi32_avx2(u[2], k32_m05_p27); + v[31] = k_madd_epi32_avx2(u[3], k32_m05_p27); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], 
k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + + v[0] = _mm256_cmpgt_epi32(kZero, u[0]); + v[1] = _mm256_cmpgt_epi32(kZero, u[1]); + v[2] = _mm256_cmpgt_epi32(kZero, u[2]); + v[3] = _mm256_cmpgt_epi32(kZero, u[3]); + v[4] = _mm256_cmpgt_epi32(kZero, u[4]); + v[5] = _mm256_cmpgt_epi32(kZero, u[5]); + v[6] = _mm256_cmpgt_epi32(kZero, u[6]); + v[7] = _mm256_cmpgt_epi32(kZero, u[7]); + v[8] = _mm256_cmpgt_epi32(kZero, u[8]); + v[9] = _mm256_cmpgt_epi32(kZero, u[9]); + v[10] = _mm256_cmpgt_epi32(kZero, u[10]); + v[11] = _mm256_cmpgt_epi32(kZero, u[11]); + v[12] = 
_mm256_cmpgt_epi32(kZero, u[12]); + v[13] = _mm256_cmpgt_epi32(kZero, u[13]); + v[14] = _mm256_cmpgt_epi32(kZero, u[14]); + v[15] = _mm256_cmpgt_epi32(kZero, u[15]); + + u[0] = _mm256_sub_epi32(u[0], v[0]); + u[1] = _mm256_sub_epi32(u[1], v[1]); + u[2] = _mm256_sub_epi32(u[2], v[2]); + u[3] = _mm256_sub_epi32(u[3], v[3]); + u[4] = _mm256_sub_epi32(u[4], v[4]); + u[5] = _mm256_sub_epi32(u[5], v[5]); + u[6] = _mm256_sub_epi32(u[6], v[6]); + u[7] = _mm256_sub_epi32(u[7], v[7]); + u[8] = _mm256_sub_epi32(u[8], v[8]); + u[9] = _mm256_sub_epi32(u[9], v[9]); + u[10] = _mm256_sub_epi32(u[10], v[10]); + u[11] = _mm256_sub_epi32(u[11], v[11]); + u[12] = _mm256_sub_epi32(u[12], v[12]); + u[13] = _mm256_sub_epi32(u[13], v[13]); + u[14] = _mm256_sub_epi32(u[14], v[14]); + u[15] = _mm256_sub_epi32(u[15], v[15]); + + v[0] = _mm256_add_epi32(u[0], K32One); + v[1] = _mm256_add_epi32(u[1], K32One); + v[2] = _mm256_add_epi32(u[2], K32One); + v[3] = _mm256_add_epi32(u[3], K32One); + v[4] = _mm256_add_epi32(u[4], K32One); + v[5] = _mm256_add_epi32(u[5], K32One); + v[6] = _mm256_add_epi32(u[6], K32One); + v[7] = _mm256_add_epi32(u[7], K32One); + v[8] = _mm256_add_epi32(u[8], K32One); + v[9] = _mm256_add_epi32(u[9], K32One); + v[10] = _mm256_add_epi32(u[10], K32One); + v[11] = _mm256_add_epi32(u[11], K32One); + v[12] = _mm256_add_epi32(u[12], K32One); + v[13] = _mm256_add_epi32(u[13], K32One); + v[14] = _mm256_add_epi32(u[14], K32One); + v[15] = _mm256_add_epi32(u[15], K32One); + + u[0] = _mm256_srai_epi32(v[0], 2); + u[1] = _mm256_srai_epi32(v[1], 2); + u[2] = _mm256_srai_epi32(v[2], 2); + u[3] = _mm256_srai_epi32(v[3], 2); + u[4] = _mm256_srai_epi32(v[4], 2); + u[5] = _mm256_srai_epi32(v[5], 2); + u[6] = _mm256_srai_epi32(v[6], 2); + u[7] = _mm256_srai_epi32(v[7], 2); + u[8] = _mm256_srai_epi32(v[8], 2); + u[9] = _mm256_srai_epi32(v[9], 2); + u[10] = _mm256_srai_epi32(v[10], 2); + u[11] = _mm256_srai_epi32(v[11], 2); + u[12] = _mm256_srai_epi32(v[12], 2); + u[13] = 
_mm256_srai_epi32(v[13], 2); + u[14] = _mm256_srai_epi32(v[14], 2); + u[15] = _mm256_srai_epi32(v[15], 2); + + out[5] = _mm256_packs_epi32(u[0], u[1]); + out[21] = _mm256_packs_epi32(u[2], u[3]); + out[13] = _mm256_packs_epi32(u[4], u[5]); + out[29] = _mm256_packs_epi32(u[6], u[7]); + out[3] = _mm256_packs_epi32(u[8], u[9]); + out[19] = _mm256_packs_epi32(u[10], u[11]); + out[11] = _mm256_packs_epi32(u[12], u[13]); + out[27] = _mm256_packs_epi32(u[14], u[15]); + } + } +#endif + // Transpose the results, do it as four 8x8 transposes. + { + int transpose_block; + int16_t *output_currStep, *output_nextStep; + tran_low_t *curr_out, *next_out; + // Pass 0 + output_currStep = &intermediate[column_start * 32]; + output_nextStep = &intermediate[(column_start + 8) * 32]; + // Pass 1 + curr_out = &output_org[column_start * 32]; + next_out = &output_org[(column_start + 8) * 32]; + + for (transpose_block = 0; transpose_block < 4; ++transpose_block) { + __m256i *this_out = &out[8 * transpose_block]; + // 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + // 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 + // 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 + // 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 + // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 + // 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 + // 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 + // 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 + const __m256i tr0_0 = _mm256_unpacklo_epi16(this_out[0], this_out[1]); + const __m256i tr0_1 = _mm256_unpacklo_epi16(this_out[2], this_out[3]); + const __m256i tr0_2 = _mm256_unpackhi_epi16(this_out[0], this_out[1]); + const __m256i tr0_3 = _mm256_unpackhi_epi16(this_out[2], this_out[3]); + const __m256i tr0_4 = _mm256_unpacklo_epi16(this_out[4], this_out[5]); + const __m256i tr0_5 = _mm256_unpacklo_epi16(this_out[6], this_out[7]); + const __m256i tr0_6 = _mm256_unpackhi_epi16(this_out[4], this_out[5]); + const 
__m256i tr0_7 = _mm256_unpackhi_epi16(this_out[6], this_out[7]); + // 00 20 01 21 02 22 03 23 08 28 09 29 10 30 11 31 + // 40 60 41 61 42 62 43 63 48 68 49 69 50 70 51 71 + // 04 24 05 25 06 26 07 27 12 32 13 33 14 34 15 35 + // 44 64 45 65 46 66 47 67 52 72 53 73 54 74 55 75 + // 80 100 81 101 82 102 83 103 88 108 89 109 90 110 91 101 + // 120 140 121 141 122 142 123 143 128 148 129 149 130 150 131 151 + // 84 104 85 105 86 106 87 107 92 112 93 113 94 114 95 115 + // 124 144 125 145 126 146 127 147 132 152 133 153 134 154 135 155 + + const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1); + const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3); + const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1); + const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3); + const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5); + const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7); + const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5); + const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7); + // 00 20 40 60 01 21 41 61 08 28 48 68 09 29 49 69 + // 04 24 44 64 05 25 45 65 12 32 52 72 13 33 53 73 + // 02 22 42 62 03 23 43 63 10 30 50 70 11 31 51 71 + // 06 26 46 66 07 27 47 67 14 34 54 74 15 35 55 75 + // 80 100 120 140 81 101 121 141 88 108 128 148 89 109 129 149 + // 84 104 124 144 85 105 125 145 92 112 132 152 93 113 133 153 + // 82 102 122 142 83 103 123 143 90 110 130 150 91 101 131 151 + // 86 106 126 146 87 107 127 147 94 114 134 154 95 115 135 155 + __m256i tr2_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); + __m256i tr2_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); + __m256i tr2_2 = _mm256_unpacklo_epi64(tr1_2, tr1_6); + __m256i tr2_3 = _mm256_unpackhi_epi64(tr1_2, tr1_6); + __m256i tr2_4 = _mm256_unpacklo_epi64(tr1_1, tr1_5); + __m256i tr2_5 = _mm256_unpackhi_epi64(tr1_1, tr1_5); + __m256i tr2_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); + __m256i tr2_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); + // 00 20 40 60 80 100 120 140 08 28 48 68 88 108 128 148 + 
// 01 21 41 61 81 101 121 141 09 29 49 69 89 109 129 149 + // 02 22 42 62 82 102 122 142 10 30 50 70 90 110 130 150 + // 03 23 43 63 83 103 123 143 11 31 51 71 91 101 131 151 + // 04 24 44 64 84 104 124 144 12 32 52 72 92 112 132 152 + // 05 25 45 65 85 105 125 145 13 33 53 73 93 113 133 153 + // 06 26 46 66 86 106 126 146 14 34 54 74 94 114 134 154 + // 07 27 47 67 87 107 127 147 15 35 55 75 95 115 135 155 + if (0 == pass) { + // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2; + // TODO(cd): see quality impact of only doing + // output[j] = (output[j] + 1) >> 2; + // which would remove the code between here ... + __m256i tr2_0_0 = _mm256_cmpgt_epi16(tr2_0, kZero); + __m256i tr2_1_0 = _mm256_cmpgt_epi16(tr2_1, kZero); + __m256i tr2_2_0 = _mm256_cmpgt_epi16(tr2_2, kZero); + __m256i tr2_3_0 = _mm256_cmpgt_epi16(tr2_3, kZero); + __m256i tr2_4_0 = _mm256_cmpgt_epi16(tr2_4, kZero); + __m256i tr2_5_0 = _mm256_cmpgt_epi16(tr2_5, kZero); + __m256i tr2_6_0 = _mm256_cmpgt_epi16(tr2_6, kZero); + __m256i tr2_7_0 = _mm256_cmpgt_epi16(tr2_7, kZero); + tr2_0 = _mm256_sub_epi16(tr2_0, tr2_0_0); + tr2_1 = _mm256_sub_epi16(tr2_1, tr2_1_0); + tr2_2 = _mm256_sub_epi16(tr2_2, tr2_2_0); + tr2_3 = _mm256_sub_epi16(tr2_3, tr2_3_0); + tr2_4 = _mm256_sub_epi16(tr2_4, tr2_4_0); + tr2_5 = _mm256_sub_epi16(tr2_5, tr2_5_0); + tr2_6 = _mm256_sub_epi16(tr2_6, tr2_6_0); + tr2_7 = _mm256_sub_epi16(tr2_7, tr2_7_0); + // ... and here. 
+ // PS: also change code in av1/encoder/av1_dct.c + tr2_0 = _mm256_add_epi16(tr2_0, kOne); + tr2_1 = _mm256_add_epi16(tr2_1, kOne); + tr2_2 = _mm256_add_epi16(tr2_2, kOne); + tr2_3 = _mm256_add_epi16(tr2_3, kOne); + tr2_4 = _mm256_add_epi16(tr2_4, kOne); + tr2_5 = _mm256_add_epi16(tr2_5, kOne); + tr2_6 = _mm256_add_epi16(tr2_6, kOne); + tr2_7 = _mm256_add_epi16(tr2_7, kOne); + tr2_0 = _mm256_srai_epi16(tr2_0, 2); + tr2_1 = _mm256_srai_epi16(tr2_1, 2); + tr2_2 = _mm256_srai_epi16(tr2_2, 2); + tr2_3 = _mm256_srai_epi16(tr2_3, 2); + tr2_4 = _mm256_srai_epi16(tr2_4, 2); + tr2_5 = _mm256_srai_epi16(tr2_5, 2); + tr2_6 = _mm256_srai_epi16(tr2_6, 2); + tr2_7 = _mm256_srai_epi16(tr2_7, 2); + } + if (0 == pass) { + // Note: even though all these stores are aligned, using the aligned + // intrinsic make the code slightly slower. + _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32), + _mm256_castsi256_si128(tr2_0)); + _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32), + _mm256_castsi256_si128(tr2_1)); + _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32), + _mm256_castsi256_si128(tr2_2)); + _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32), + _mm256_castsi256_si128(tr2_3)); + _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32), + _mm256_castsi256_si128(tr2_4)); + _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32), + _mm256_castsi256_si128(tr2_5)); + _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32), + _mm256_castsi256_si128(tr2_6)); + _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32), + _mm256_castsi256_si128(tr2_7)); + + _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32), + _mm256_extractf128_si256(tr2_0, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32), + _mm256_extractf128_si256(tr2_1, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32), + _mm256_extractf128_si256(tr2_2, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32), + _mm256_extractf128_si256(tr2_3, 1)); + _mm_storeu_si128((__m128i 
*)(output_nextStep + 4 * 32), + _mm256_extractf128_si256(tr2_4, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32), + _mm256_extractf128_si256(tr2_5, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), + _mm256_extractf128_si256(tr2_6, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), + _mm256_extractf128_si256(tr2_7, 1)); + // Process next 8x8 + output_currStep += 8; + output_nextStep += 8; + } + if (1 == pass) { + store_coeff(&tr2_0, curr_out + 0 * 32, next_out + 0 * 32); + store_coeff(&tr2_1, curr_out + 1 * 32, next_out + 1 * 32); + store_coeff(&tr2_2, curr_out + 2 * 32, next_out + 2 * 32); + store_coeff(&tr2_3, curr_out + 3 * 32, next_out + 3 * 32); + store_coeff(&tr2_4, curr_out + 4 * 32, next_out + 4 * 32); + store_coeff(&tr2_5, curr_out + 5 * 32, next_out + 5 * 32); + store_coeff(&tr2_6, curr_out + 6 * 32, next_out + 6 * 32); + store_coeff(&tr2_7, curr_out + 7 * 32, next_out + 7 * 32); + curr_out += 8; + next_out += 8; + } + } + } + } + } + _mm256_zeroupper(); +} // NOLINT diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h new file mode 100644 index 000000000..69dd6af11 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h @@ -0,0 +1,3201 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> // SSE2 + +#include "aom_dsp/fwd_txfm.h" +#include "aom_dsp/txfm_common.h" +#include "aom_dsp/x86/txfm_common_sse2.h" + +// TODO(jingning) The high bit-depth version needs re-work for performance. +// The current SSE2 implementation also causes cross reference to the static +// functions in the C implementation file. +#if DCT_HIGH_BIT_DEPTH +// High bit-depth builds route 16-bit add/sub through the saturating intrinsics. +// NOTE(review): presumably so the check_epi16_overflow_* calls further down can +// detect clamped lanes - confirm against aom_dsp/x86/txfm_common_sse2.h. +#define ADD_EPI16 _mm_adds_epi16 +#define SUB_EPI16 _mm_subs_epi16 +#if FDCT32x32_HIGH_PRECISION +// C fallback for the second pass (high-precision build). For each i in 0..31 it +// gathers intermediate[j * 32 + i] for j in 0..31, runs aom_fdct32() on those +// 32 values with a last argument of 0, and stores +// (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2 into out[j + i * 32]. +// FDCT32x32_2D calls it through HIGH_FDCT32x32_2D_ROWS_C, then returns, when +// one of its overflow checks reports a non-zero result. +// NOTE(review): the meaning of aom_fdct32()'s last argument is not visible +// here - confirm against its definition. +void aom_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) { + int i, j; + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i]; + aom_fdct32(temp_in, temp_out, 0); + for (j = 0; j < 32; ++j) + out[j + i * 32] = + (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); + } +} +#define HIGH_FDCT32x32_2D_C aom_highbd_fdct32x32_c +#define HIGH_FDCT32x32_2D_ROWS_C aom_fdct32x32_rows_c +#else +// C fallback for the second pass when FDCT32x32_HIGH_PRECISION is 0. For each i +// in 0..31 it gathers intermediate[j * 32 + i] for j in 0..31, runs +// aom_fdct32() on those 32 values with a last argument of 1, and stores +// temp_out[j] into out[j + i * 32] with no further rounding or shift. +// FDCT32x32_2D calls it through HIGH_FDCT32x32_2D_ROWS_C, then returns, when +// one of its overflow checks reports a non-zero result. +// NOTE(review): the meaning of aom_fdct32()'s last argument is not visible +// here - confirm against its definition. +void aom_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) { + int i, j; + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i]; + aom_fdct32(temp_in, temp_out, 1); + for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j]; + } +} +#define HIGH_FDCT32x32_2D_C aom_highbd_fdct32x32_rd_c +#define HIGH_FDCT32x32_2D_ROWS_C aom_fdct32x32_rd_rows_c +#endif // FDCT32x32_HIGH_PRECISION +#else +#define ADD_EPI16 _mm_add_epi16 +#define SUB_EPI16 _mm_sub_epi16 +#endif // DCT_HIGH_BIT_DEPTH + +void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) { + // Calculate pre-multiplied strides + const int str1 = stride; + const int str2 = 2 * stride; + const int str3 = 2 * stride + str1; + // We need an intermediate buffer between passes. + DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); + // Constants + // When we use them, in one case, they are all the same. 
In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); + const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); + const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); + const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); + const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); + const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, 
cospi_7_64); + const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); + const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); + const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); + const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); + const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); + const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); + const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); + const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); + const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); + const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); + const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kZero = _mm_set1_epi16(0); + const __m128i kOne = _mm_set1_epi16(1); + // Do the two transform/transpose passes + int pass; +#if DCT_HIGH_BIT_DEPTH + int overflow; +#endif + for (pass = 0; pass < 2; ++pass) { + // We process eight columns (transposed rows in second pass) at a time. + int column_start; + for (column_start = 0; column_start < 32; column_start += 8) { + __m128i step1[32]; + __m128i step2[32]; + __m128i step3[32]; + __m128i out[32]; + // Stage 1 + // Note: even though all the loads below are aligned, using the aligned + // intrinsic make the code slightly slower. + if (0 == pass) { + const int16_t *in = &input[column_start]; + // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; + // Note: the next four blocks could be in a loop. That would help the + // instruction cache but is actually slower. 
+ { + const int16_t *ina = in + 0 * str1; + const int16_t *inb = in + 31 * str1; + __m128i *step1a = &step1[0]; + __m128i *step1b = &step1[31]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = _mm_slli_epi16(step1a[3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 4 * str1; + const int16_t *inb = in + 27 * str1; + __m128i *step1a = &step1[4]; + __m128i *step1b = &step1[27]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = 
_mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = _mm_slli_epi16(step1a[3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 8 * str1; + const int16_t *inb = in + 23 * str1; + __m128i *step1a = &step1[8]; + __m128i *step1b = &step1[23]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = 
_mm_slli_epi16(step1a[3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 12 * str1; + const int16_t *inb = in + 19 * str1; + __m128i *step1a = &step1[12]; + __m128i *step1b = &step1[19]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = _mm_slli_epi16(step1a[3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + } else { + int16_t *in = &intermediate[column_start]; + // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32]; + // Note: using the same approach as above to have common offset is + // counter-productive as all offsets can be calculated at compile + // time. + // Note: the next four blocks could be in a loop. That would help the + // instruction cache but is actually slower. 
+ { + __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32)); + __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32)); + __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32)); + __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32)); + __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32)); + __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32)); + __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32)); + __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32)); + step1[0] = ADD_EPI16(in00, in31); + step1[1] = ADD_EPI16(in01, in30); + step1[2] = ADD_EPI16(in02, in29); + step1[3] = ADD_EPI16(in03, in28); + step1[28] = SUB_EPI16(in03, in28); + step1[29] = SUB_EPI16(in02, in29); + step1[30] = SUB_EPI16(in01, in30); + step1[31] = SUB_EPI16(in00, in31); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2], + &step1[3], &step1[28], &step1[29], + &step1[30], &step1[31]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32)); + __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32)); + __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32)); + __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32)); + __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32)); + __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32)); + __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32)); + __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32)); + step1[4] = ADD_EPI16(in04, in27); + step1[5] = ADD_EPI16(in05, in26); + step1[6] = ADD_EPI16(in06, in25); + step1[7] = ADD_EPI16(in07, in24); + step1[24] = SUB_EPI16(in07, in24); + step1[25] = SUB_EPI16(in06, in25); + step1[26] = SUB_EPI16(in05, in26); + step1[27] = SUB_EPI16(in04, in27); +#if DCT_HIGH_BIT_DEPTH + overflow = 
check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6], + &step1[7], &step1[24], &step1[25], + &step1[26], &step1[27]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32)); + __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32)); + __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32)); + __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32)); + __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32)); + __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32)); + __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32)); + __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32)); + step1[8] = ADD_EPI16(in08, in23); + step1[9] = ADD_EPI16(in09, in22); + step1[10] = ADD_EPI16(in10, in21); + step1[11] = ADD_EPI16(in11, in20); + step1[20] = SUB_EPI16(in11, in20); + step1[21] = SUB_EPI16(in10, in21); + step1[22] = SUB_EPI16(in09, in22); + step1[23] = SUB_EPI16(in08, in23); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10], + &step1[11], &step1[20], &step1[21], + &step1[22], &step1[23]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32)); + __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32)); + __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32)); + __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32)); + __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32)); + __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32)); + __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32)); + __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32)); + step1[12] = ADD_EPI16(in12, in19); + step1[13] = ADD_EPI16(in13, in18); + step1[14] = ADD_EPI16(in14, 
in17); + step1[15] = ADD_EPI16(in15, in16); + step1[16] = SUB_EPI16(in15, in16); + step1[17] = SUB_EPI16(in14, in17); + step1[18] = SUB_EPI16(in13, in18); + step1[19] = SUB_EPI16(in12, in19); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14], + &step1[15], &step1[16], &step1[17], + &step1[18], &step1[19]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + // Stage 2 + { + step2[0] = ADD_EPI16(step1[0], step1[15]); + step2[1] = ADD_EPI16(step1[1], step1[14]); + step2[2] = ADD_EPI16(step1[2], step1[13]); + step2[3] = ADD_EPI16(step1[3], step1[12]); + step2[4] = ADD_EPI16(step1[4], step1[11]); + step2[5] = ADD_EPI16(step1[5], step1[10]); + step2[6] = ADD_EPI16(step1[6], step1[9]); + step2[7] = ADD_EPI16(step1[7], step1[8]); + step2[8] = SUB_EPI16(step1[7], step1[8]); + step2[9] = SUB_EPI16(step1[6], step1[9]); + step2[10] = SUB_EPI16(step1[5], step1[10]); + step2[11] = SUB_EPI16(step1[4], step1[11]); + step2[12] = SUB_EPI16(step1[3], step1[12]); + step2[13] = SUB_EPI16(step1[2], step1[13]); + step2[14] = SUB_EPI16(step1[1], step1[14]); + step2[15] = SUB_EPI16(step1[0], step1[15]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x16( + &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5], + &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11], + &step2[12], &step2[13], &step2[14], &step2[15]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]); + const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]); + const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]); + const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]); + const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], 
step1[22]); + const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]); + const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]); + const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]); + const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16); + const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16); + const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16); + const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16); + const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16); + const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16); + const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16); + const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16); + const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16); + const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16); + const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16); + const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16); + const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16); + const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16); + const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16); + const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); + const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); + const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); + const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); + const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); + const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); + const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); + const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); + const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, 
k__DCT_CONST_ROUNDING); + const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); + const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); + const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); + const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); + const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); + const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); + const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); + const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS); + const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS); + const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS); + const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS); + const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS); + const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS); + const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS); + const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS); + const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS); + const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS); + const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS); + const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS); + const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS); + const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS); + const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS); + const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS); + // Combine + step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7); + step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7); + step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7); + step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7); + step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7); + step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7); + step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7); + step2[27] = 
_mm_packs_epi32(s2_27_6, s2_27_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22], + &step2[23], &step2[24], &step2[25], + &step2[26], &step2[27]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + +#if !FDCT32x32_HIGH_PRECISION + // dump the magnitude by half, hence the intermediate values are within + // the range of 16 bits. + if (1 == pass) { + __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero); + __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero); + __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero); + __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero); + __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero); + __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero); + __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero); + __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero); + __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero); + __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero); + __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero); + __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero); + __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero); + __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero); + __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero); + __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero); + __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero); + __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero); + __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero); + __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero); + __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero); + __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero); + __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero); + __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero); + __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero); + __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero); + __m128i s3_26_0 = 
_mm_cmplt_epi16(step2[26], kZero); + __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero); + __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero); + __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero); + __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero); + __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero); + + step2[0] = SUB_EPI16(step2[0], s3_00_0); + step2[1] = SUB_EPI16(step2[1], s3_01_0); + step2[2] = SUB_EPI16(step2[2], s3_02_0); + step2[3] = SUB_EPI16(step2[3], s3_03_0); + step2[4] = SUB_EPI16(step2[4], s3_04_0); + step2[5] = SUB_EPI16(step2[5], s3_05_0); + step2[6] = SUB_EPI16(step2[6], s3_06_0); + step2[7] = SUB_EPI16(step2[7], s3_07_0); + step2[8] = SUB_EPI16(step2[8], s2_08_0); + step2[9] = SUB_EPI16(step2[9], s2_09_0); + step2[10] = SUB_EPI16(step2[10], s3_10_0); + step2[11] = SUB_EPI16(step2[11], s3_11_0); + step2[12] = SUB_EPI16(step2[12], s3_12_0); + step2[13] = SUB_EPI16(step2[13], s3_13_0); + step2[14] = SUB_EPI16(step2[14], s2_14_0); + step2[15] = SUB_EPI16(step2[15], s2_15_0); + step1[16] = SUB_EPI16(step1[16], s3_16_0); + step1[17] = SUB_EPI16(step1[17], s3_17_0); + step1[18] = SUB_EPI16(step1[18], s3_18_0); + step1[19] = SUB_EPI16(step1[19], s3_19_0); + step2[20] = SUB_EPI16(step2[20], s3_20_0); + step2[21] = SUB_EPI16(step2[21], s3_21_0); + step2[22] = SUB_EPI16(step2[22], s3_22_0); + step2[23] = SUB_EPI16(step2[23], s3_23_0); + step2[24] = SUB_EPI16(step2[24], s3_24_0); + step2[25] = SUB_EPI16(step2[25], s3_25_0); + step2[26] = SUB_EPI16(step2[26], s3_26_0); + step2[27] = SUB_EPI16(step2[27], s3_27_0); + step1[28] = SUB_EPI16(step1[28], s3_28_0); + step1[29] = SUB_EPI16(step1[29], s3_29_0); + step1[30] = SUB_EPI16(step1[30], s3_30_0); + step1[31] = SUB_EPI16(step1[31], s3_31_0); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x32( + &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5], + &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11], + &step2[12], &step2[13], &step2[14], &step2[15], &step1[16], + 
&step1[17], &step1[18], &step1[19], &step2[20], &step2[21], + &step2[22], &step2[23], &step2[24], &step2[25], &step2[26], + &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + step2[0] = _mm_add_epi16(step2[0], kOne); + step2[1] = _mm_add_epi16(step2[1], kOne); + step2[2] = _mm_add_epi16(step2[2], kOne); + step2[3] = _mm_add_epi16(step2[3], kOne); + step2[4] = _mm_add_epi16(step2[4], kOne); + step2[5] = _mm_add_epi16(step2[5], kOne); + step2[6] = _mm_add_epi16(step2[6], kOne); + step2[7] = _mm_add_epi16(step2[7], kOne); + step2[8] = _mm_add_epi16(step2[8], kOne); + step2[9] = _mm_add_epi16(step2[9], kOne); + step2[10] = _mm_add_epi16(step2[10], kOne); + step2[11] = _mm_add_epi16(step2[11], kOne); + step2[12] = _mm_add_epi16(step2[12], kOne); + step2[13] = _mm_add_epi16(step2[13], kOne); + step2[14] = _mm_add_epi16(step2[14], kOne); + step2[15] = _mm_add_epi16(step2[15], kOne); + step1[16] = _mm_add_epi16(step1[16], kOne); + step1[17] = _mm_add_epi16(step1[17], kOne); + step1[18] = _mm_add_epi16(step1[18], kOne); + step1[19] = _mm_add_epi16(step1[19], kOne); + step2[20] = _mm_add_epi16(step2[20], kOne); + step2[21] = _mm_add_epi16(step2[21], kOne); + step2[22] = _mm_add_epi16(step2[22], kOne); + step2[23] = _mm_add_epi16(step2[23], kOne); + step2[24] = _mm_add_epi16(step2[24], kOne); + step2[25] = _mm_add_epi16(step2[25], kOne); + step2[26] = _mm_add_epi16(step2[26], kOne); + step2[27] = _mm_add_epi16(step2[27], kOne); + step1[28] = _mm_add_epi16(step1[28], kOne); + step1[29] = _mm_add_epi16(step1[29], kOne); + step1[30] = _mm_add_epi16(step1[30], kOne); + step1[31] = _mm_add_epi16(step1[31], kOne); + + step2[0] = _mm_srai_epi16(step2[0], 2); + step2[1] = _mm_srai_epi16(step2[1], 2); + step2[2] = _mm_srai_epi16(step2[2], 2); + step2[3] = _mm_srai_epi16(step2[3], 2); + step2[4] = _mm_srai_epi16(step2[4], 2); + step2[5] = 
_mm_srai_epi16(step2[5], 2); + step2[6] = _mm_srai_epi16(step2[6], 2); + step2[7] = _mm_srai_epi16(step2[7], 2); + step2[8] = _mm_srai_epi16(step2[8], 2); + step2[9] = _mm_srai_epi16(step2[9], 2); + step2[10] = _mm_srai_epi16(step2[10], 2); + step2[11] = _mm_srai_epi16(step2[11], 2); + step2[12] = _mm_srai_epi16(step2[12], 2); + step2[13] = _mm_srai_epi16(step2[13], 2); + step2[14] = _mm_srai_epi16(step2[14], 2); + step2[15] = _mm_srai_epi16(step2[15], 2); + step1[16] = _mm_srai_epi16(step1[16], 2); + step1[17] = _mm_srai_epi16(step1[17], 2); + step1[18] = _mm_srai_epi16(step1[18], 2); + step1[19] = _mm_srai_epi16(step1[19], 2); + step2[20] = _mm_srai_epi16(step2[20], 2); + step2[21] = _mm_srai_epi16(step2[21], 2); + step2[22] = _mm_srai_epi16(step2[22], 2); + step2[23] = _mm_srai_epi16(step2[23], 2); + step2[24] = _mm_srai_epi16(step2[24], 2); + step2[25] = _mm_srai_epi16(step2[25], 2); + step2[26] = _mm_srai_epi16(step2[26], 2); + step2[27] = _mm_srai_epi16(step2[27], 2); + step1[28] = _mm_srai_epi16(step1[28], 2); + step1[29] = _mm_srai_epi16(step1[29], 2); + step1[30] = _mm_srai_epi16(step1[30], 2); + step1[31] = _mm_srai_epi16(step1[31], 2); + } +#endif // !FDCT32x32_HIGH_PRECISION + +#if FDCT32x32_HIGH_PRECISION + if (pass == 0) { +#endif + // Stage 3 + { + step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]); + step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]); + step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]); + step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]); + step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]); + step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]); + step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]); + step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2], + &step3[3], &step3[4], &step3[5], + &step3[6], &step3[7]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } 
+#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); + const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); + const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); + const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); + const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); + const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); + const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); + const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); + const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); + const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); + const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); + const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, 
DCT_CONST_BITS); + // Combine + step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); + step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); + step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); + step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12], + &step3[13]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + step3[16] = ADD_EPI16(step2[23], step1[16]); + step3[17] = ADD_EPI16(step2[22], step1[17]); + step3[18] = ADD_EPI16(step2[21], step1[18]); + step3[19] = ADD_EPI16(step2[20], step1[19]); + step3[20] = SUB_EPI16(step1[19], step2[20]); + step3[21] = SUB_EPI16(step1[18], step2[21]); + step3[22] = SUB_EPI16(step1[17], step2[22]); + step3[23] = SUB_EPI16(step1[16], step2[23]); + step3[24] = SUB_EPI16(step1[31], step2[24]); + step3[25] = SUB_EPI16(step1[30], step2[25]); + step3[26] = SUB_EPI16(step1[29], step2[26]); + step3[27] = SUB_EPI16(step1[28], step2[27]); + step3[28] = ADD_EPI16(step2[27], step1[28]); + step3[29] = ADD_EPI16(step2[26], step1[29]); + step3[30] = ADD_EPI16(step2[25], step1[30]); + step3[31] = ADD_EPI16(step2[24], step1[31]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x16( + &step3[16], &step3[17], &step3[18], &step3[19], &step3[20], + &step3[21], &step3[22], &step3[23], &step3[24], &step3[25], + &step3[26], &step3[27], &step3[28], &step3[29], &step3[30], + &step3[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + + // Stage 4 + { + step1[0] = ADD_EPI16(step3[3], step3[0]); + step1[1] = ADD_EPI16(step3[2], step3[1]); + step1[2] = SUB_EPI16(step3[1], step3[2]); + step1[3] = SUB_EPI16(step3[0], step3[3]); + step1[8] = ADD_EPI16(step3[11], step2[8]); + step1[9] = 
ADD_EPI16(step3[10], step2[9]); + step1[10] = SUB_EPI16(step2[9], step3[10]); + step1[11] = SUB_EPI16(step2[8], step3[11]); + step1[12] = SUB_EPI16(step2[15], step3[12]); + step1[13] = SUB_EPI16(step2[14], step3[13]); + step1[14] = ADD_EPI16(step3[13], step2[14]); + step1[15] = ADD_EPI16(step3[12], step2[15]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x16( + &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5], + &step1[6], &step1[7], &step1[8], &step1[9], &step1[10], + &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); + const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); + const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); + const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); + const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); + const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); + const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); + const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); + const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); + const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); + const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); + const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); + const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); + // Combine + step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); + step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&step1[5], &step1[6]); + if (overflow) { + if (pass == 0) + 
HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); + const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); + const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); + const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); + const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); + const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); + const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); + const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); + const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); + const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); + const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); + const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); + const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); + const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); + const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); + const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); + const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); + const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); + const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); + const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); + const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); + const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); + const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); + const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); + const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); + const __m128i 
s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); + const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); + const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); + const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); + const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); + const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); + const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); + const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); + const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); + const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); + const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); + const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); + const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); + const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); + const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); + const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); + const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); + const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); + const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); + const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); + const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); + const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); + const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); + const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); + const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); + const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); + const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); + const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); + const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, 
DCT_CONST_BITS); + const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); + // Combine + step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); + step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); + step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); + step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); + step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); + step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); + step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); + step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20], + &step1[21], &step1[26], &step1[27], + &step1[28], &step1[29]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // Stage 5 + { + step2[4] = ADD_EPI16(step1[5], step3[4]); + step2[5] = SUB_EPI16(step3[4], step1[5]); + step2[6] = SUB_EPI16(step3[7], step1[6]); + step2[7] = ADD_EPI16(step1[6], step3[7]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6], + &step2[7]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); + const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); + const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); + const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); + const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); + const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); + const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); + const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); + const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); + const __m128i out_08_3 = 
_mm_madd_epi16(out_08_1, k__cospi_p24_p08); + const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); + const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i out_00_4 = + _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); + const __m128i out_00_5 = + _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); + const __m128i out_16_4 = + _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); + const __m128i out_16_5 = + _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); + const __m128i out_08_4 = + _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); + const __m128i out_08_5 = + _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); + const __m128i out_24_4 = + _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); + const __m128i out_24_5 = + _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); + const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); + const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); + const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); + const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); + const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); + const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); + const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); + const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); + // Combine + out[0] = _mm_packs_epi32(out_00_6, out_00_7); + out[16] = _mm_packs_epi32(out_16_6, out_16_7); + out[8] = _mm_packs_epi32(out_08_6, out_08_7); + out[24] = _mm_packs_epi32(out_24_6, out_24_7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]); + const __m128i s2_09_1 = 
_mm_unpackhi_epi16(step1[9], step1[14]); + const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); + const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); + const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); + const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); + const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); + const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); + const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); + const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); + const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); + const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); + const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); + const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); + const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); + const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); + const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); + const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); + const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); + const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); + const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); + const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); + const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); + const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); + const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); + const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); + const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); + // Combine + step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7); + step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); + step2[13] = 
_mm_packs_epi32(s2_13_6, s2_13_7); + step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13], + &step2[14]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + step2[16] = ADD_EPI16(step1[19], step3[16]); + step2[17] = ADD_EPI16(step1[18], step3[17]); + step2[18] = SUB_EPI16(step3[17], step1[18]); + step2[19] = SUB_EPI16(step3[16], step1[19]); + step2[20] = SUB_EPI16(step3[23], step1[20]); + step2[21] = SUB_EPI16(step3[22], step1[21]); + step2[22] = ADD_EPI16(step1[21], step3[22]); + step2[23] = ADD_EPI16(step1[20], step3[23]); + step2[24] = ADD_EPI16(step1[27], step3[24]); + step2[25] = ADD_EPI16(step1[26], step3[25]); + step2[26] = SUB_EPI16(step3[25], step1[26]); + step2[27] = SUB_EPI16(step3[24], step1[27]); + step2[28] = SUB_EPI16(step3[31], step1[28]); + step2[29] = SUB_EPI16(step3[30], step1[29]); + step2[30] = ADD_EPI16(step1[29], step3[30]); + step2[31] = ADD_EPI16(step1[28], step3[31]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x16( + &step2[16], &step2[17], &step2[18], &step2[19], &step2[20], + &step2[21], &step2[22], &step2[23], &step2[24], &step2[25], + &step2[26], &step2[27], &step2[28], &step2[29], &step2[30], + &step2[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // Stage 6 + { + const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_12_1 = 
_mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); + const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); + const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); + const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); + const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); + const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); + const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); + const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); + // dct_const_round_shift + const __m128i out_04_4 = + _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); + const __m128i out_04_5 = + _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); + const __m128i out_20_4 = + _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); + const __m128i out_20_5 = + _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); + const __m128i out_12_4 = + _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); + const __m128i out_12_5 = + _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); + const __m128i out_28_4 = + _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); + const __m128i out_28_5 = + _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); + const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); + const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); + const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); + const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS); + const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); + const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); + const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); + const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); + // Combine + out[4] = _mm_packs_epi32(out_04_6, out_04_7); + 
out[20] = _mm_packs_epi32(out_20_6, out_20_7); + out[12] = _mm_packs_epi32(out_12_6, out_12_7); + out[28] = _mm_packs_epi32(out_28_6, out_28_7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + step3[8] = ADD_EPI16(step2[9], step1[8]); + step3[9] = SUB_EPI16(step1[8], step2[9]); + step3[10] = SUB_EPI16(step1[11], step2[10]); + step3[11] = ADD_EPI16(step2[10], step1[11]); + step3[12] = ADD_EPI16(step2[13], step1[12]); + step3[13] = SUB_EPI16(step1[12], step2[13]); + step3[14] = SUB_EPI16(step1[15], step2[14]); + step3[15] = ADD_EPI16(step2[14], step1[15]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10], + &step3[11], &step3[12], &step3[13], + &step3[14], &step3[15]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); + const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); + const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); + const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); + const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); + const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); + const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); + const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); + const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); + const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); + const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); + const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); + 
const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); + const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); + const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); + const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); + const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); + const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); + const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); + const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); + const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); + const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); + const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); + const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); + // dct_const_round_shift + const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); + const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); + const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); + const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); + const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); + const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); + const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); + const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); + const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); + const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); + const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); + const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); + const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); + const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); + const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); + const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); + const __m128i s3_25_4 = 
_mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); + const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); + const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); + const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); + const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); + const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); + const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); + const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); + const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); + const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); + const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); + const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); + const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); + const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); + const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); + const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); + // Combine + step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); + step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); + step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); + step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); + // Combine + step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); + step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); + step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); + step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21], + &step3[22], &step3[25], &step3[26], + &step3[29], &step3[30]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // Stage 7 + { + const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]); + const __m128i out_02_1 = 
_mm_unpackhi_epi16(step3[8], step3[15]); + const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]); + const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]); + const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); + const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); + const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); + const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); + const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); + const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); + const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); + const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); + const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); + const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); + const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); + const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); + const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); + const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); + const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); + const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); + const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); + const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); + const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); + const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); + // dct_const_round_shift + const __m128i out_02_4 = + _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); + const __m128i out_02_5 = + _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); + const __m128i out_18_4 = + _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); + const __m128i out_18_5 = + _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); + const __m128i out_10_4 = + _mm_add_epi32(out_10_2, 
k__DCT_CONST_ROUNDING); + const __m128i out_10_5 = + _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); + const __m128i out_26_4 = + _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); + const __m128i out_26_5 = + _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); + const __m128i out_06_4 = + _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); + const __m128i out_06_5 = + _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); + const __m128i out_22_4 = + _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); + const __m128i out_22_5 = + _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); + const __m128i out_14_4 = + _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); + const __m128i out_14_5 = + _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); + const __m128i out_30_4 = + _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); + const __m128i out_30_5 = + _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); + const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); + const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); + const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); + const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); + const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); + const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); + const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); + const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); + const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); + const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); + const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); + const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); + const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); + const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); + const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); + const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); + // Combine + out[2] = 
_mm_packs_epi32(out_02_6, out_02_7); + out[18] = _mm_packs_epi32(out_18_6, out_18_7); + out[10] = _mm_packs_epi32(out_10_6, out_10_7); + out[26] = _mm_packs_epi32(out_26_6, out_26_7); + out[6] = _mm_packs_epi32(out_06_6, out_06_7); + out[22] = _mm_packs_epi32(out_22_6, out_22_7); + out[14] = _mm_packs_epi32(out_14_6, out_14_7); + out[30] = _mm_packs_epi32(out_30_6, out_30_7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26], + &out[6], &out[22], &out[14], &out[30]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + step1[16] = ADD_EPI16(step3[17], step2[16]); + step1[17] = SUB_EPI16(step2[16], step3[17]); + step1[18] = SUB_EPI16(step2[19], step3[18]); + step1[19] = ADD_EPI16(step3[18], step2[19]); + step1[20] = ADD_EPI16(step3[21], step2[20]); + step1[21] = SUB_EPI16(step2[20], step3[21]); + step1[22] = SUB_EPI16(step2[23], step3[22]); + step1[23] = ADD_EPI16(step3[22], step2[23]); + step1[24] = ADD_EPI16(step3[25], step2[24]); + step1[25] = SUB_EPI16(step2[24], step3[25]); + step1[26] = SUB_EPI16(step2[27], step3[26]); + step1[27] = ADD_EPI16(step3[26], step2[27]); + step1[28] = ADD_EPI16(step3[29], step2[28]); + step1[29] = SUB_EPI16(step2[28], step3[29]); + step1[30] = SUB_EPI16(step2[31], step3[30]); + step1[31] = ADD_EPI16(step3[30], step2[31]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x16( + &step1[16], &step1[17], &step1[18], &step1[19], &step1[20], + &step1[21], &step1[22], &step1[23], &step1[24], &step1[25], + &step1[26], &step1[27], &step1[28], &step1[29], &step1[30], + &step1[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // Final stage --- outputs indices are bit-reversed. 
+ { + const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); + const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); + const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); + const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); + const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); + const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); + const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); + const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); + const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); + const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); + const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); + const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); + const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); + const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); + const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); + const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); + const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); + const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); + const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); + const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); + const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); + const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); + const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); + const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); + // dct_const_round_shift + const __m128i out_01_4 = + _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); + const __m128i out_01_5 = + _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); + const __m128i out_17_4 = + _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); + const __m128i out_17_5 = + 
_mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); + const __m128i out_09_4 = + _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); + const __m128i out_09_5 = + _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); + const __m128i out_25_4 = + _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); + const __m128i out_25_5 = + _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); + const __m128i out_07_4 = + _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); + const __m128i out_07_5 = + _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); + const __m128i out_23_4 = + _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); + const __m128i out_23_5 = + _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); + const __m128i out_15_4 = + _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); + const __m128i out_15_5 = + _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); + const __m128i out_31_4 = + _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); + const __m128i out_31_5 = + _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); + const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); + const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); + const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); + const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); + const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); + const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); + const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); + const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); + const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); + const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); + const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); + const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); + const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); + const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); + const __m128i out_31_6 = _mm_srai_epi32(out_31_4, 
DCT_CONST_BITS); + const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); + // Combine + out[1] = _mm_packs_epi32(out_01_6, out_01_7); + out[17] = _mm_packs_epi32(out_17_6, out_17_7); + out[9] = _mm_packs_epi32(out_09_6, out_09_7); + out[25] = _mm_packs_epi32(out_25_6, out_25_7); + out[7] = _mm_packs_epi32(out_07_6, out_07_7); + out[23] = _mm_packs_epi32(out_23_6, out_23_7); + out[15] = _mm_packs_epi32(out_15_6, out_15_7); + out[31] = _mm_packs_epi32(out_31_6, out_31_7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25], + &out[7], &out[23], &out[15], &out[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); + const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); + const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); + const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); + const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); + const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); + const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); + const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); + const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); + const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); + const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); + const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); + const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); + const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); + const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); + const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); + const __m128i out_03_2 = 
_mm_madd_epi16(out_29_0, k__cospi_m29_p03); + const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); + const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); + const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); + const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); + const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); + const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); + const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); + // dct_const_round_shift + const __m128i out_05_4 = + _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); + const __m128i out_05_5 = + _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); + const __m128i out_21_4 = + _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); + const __m128i out_21_5 = + _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); + const __m128i out_13_4 = + _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); + const __m128i out_13_5 = + _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); + const __m128i out_29_4 = + _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); + const __m128i out_29_5 = + _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); + const __m128i out_03_4 = + _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); + const __m128i out_03_5 = + _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); + const __m128i out_19_4 = + _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); + const __m128i out_19_5 = + _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); + const __m128i out_11_4 = + _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); + const __m128i out_11_5 = + _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); + const __m128i out_27_4 = + _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); + const __m128i out_27_5 = + _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); + const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); + const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); + const __m128i out_21_6 = _mm_srai_epi32(out_21_4, 
DCT_CONST_BITS); + const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); + const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); + const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); + const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); + const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); + const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); + const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); + const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); + const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); + const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); + const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); + const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); + const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); + // Combine + out[5] = _mm_packs_epi32(out_05_6, out_05_7); + out[21] = _mm_packs_epi32(out_21_6, out_21_7); + out[13] = _mm_packs_epi32(out_13_6, out_13_7); + out[29] = _mm_packs_epi32(out_29_6, out_29_7); + out[3] = _mm_packs_epi32(out_03_6, out_03_7); + out[19] = _mm_packs_epi32(out_19_6, out_19_7); + out[11] = _mm_packs_epi32(out_11_6, out_11_7); + out[27] = _mm_packs_epi32(out_27_6, out_27_7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29], + &out[3], &out[19], &out[11], &out[27]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } +#if FDCT32x32_HIGH_PRECISION + } else { + __m128i lstep1[64], lstep2[64], lstep3[64]; + __m128i u[32], v[32], sign[16]; + const __m128i K32One = _mm_set_epi32(1, 1, 1, 1); + // start using 32-bit operations + // stage 3 + { + // expanding to 32-bit length priori to addition operations + lstep2[0] = _mm_unpacklo_epi16(step2[0], kZero); + 
lstep2[1] = _mm_unpackhi_epi16(step2[0], kZero); + lstep2[2] = _mm_unpacklo_epi16(step2[1], kZero); + lstep2[3] = _mm_unpackhi_epi16(step2[1], kZero); + lstep2[4] = _mm_unpacklo_epi16(step2[2], kZero); + lstep2[5] = _mm_unpackhi_epi16(step2[2], kZero); + lstep2[6] = _mm_unpacklo_epi16(step2[3], kZero); + lstep2[7] = _mm_unpackhi_epi16(step2[3], kZero); + lstep2[8] = _mm_unpacklo_epi16(step2[4], kZero); + lstep2[9] = _mm_unpackhi_epi16(step2[4], kZero); + lstep2[10] = _mm_unpacklo_epi16(step2[5], kZero); + lstep2[11] = _mm_unpackhi_epi16(step2[5], kZero); + lstep2[12] = _mm_unpacklo_epi16(step2[6], kZero); + lstep2[13] = _mm_unpackhi_epi16(step2[6], kZero); + lstep2[14] = _mm_unpacklo_epi16(step2[7], kZero); + lstep2[15] = _mm_unpackhi_epi16(step2[7], kZero); + lstep2[0] = _mm_madd_epi16(lstep2[0], kOne); + lstep2[1] = _mm_madd_epi16(lstep2[1], kOne); + lstep2[2] = _mm_madd_epi16(lstep2[2], kOne); + lstep2[3] = _mm_madd_epi16(lstep2[3], kOne); + lstep2[4] = _mm_madd_epi16(lstep2[4], kOne); + lstep2[5] = _mm_madd_epi16(lstep2[5], kOne); + lstep2[6] = _mm_madd_epi16(lstep2[6], kOne); + lstep2[7] = _mm_madd_epi16(lstep2[7], kOne); + lstep2[8] = _mm_madd_epi16(lstep2[8], kOne); + lstep2[9] = _mm_madd_epi16(lstep2[9], kOne); + lstep2[10] = _mm_madd_epi16(lstep2[10], kOne); + lstep2[11] = _mm_madd_epi16(lstep2[11], kOne); + lstep2[12] = _mm_madd_epi16(lstep2[12], kOne); + lstep2[13] = _mm_madd_epi16(lstep2[13], kOne); + lstep2[14] = _mm_madd_epi16(lstep2[14], kOne); + lstep2[15] = _mm_madd_epi16(lstep2[15], kOne); + + lstep3[0] = _mm_add_epi32(lstep2[14], lstep2[0]); + lstep3[1] = _mm_add_epi32(lstep2[15], lstep2[1]); + lstep3[2] = _mm_add_epi32(lstep2[12], lstep2[2]); + lstep3[3] = _mm_add_epi32(lstep2[13], lstep2[3]); + lstep3[4] = _mm_add_epi32(lstep2[10], lstep2[4]); + lstep3[5] = _mm_add_epi32(lstep2[11], lstep2[5]); + lstep3[6] = _mm_add_epi32(lstep2[8], lstep2[6]); + lstep3[7] = _mm_add_epi32(lstep2[9], lstep2[7]); + lstep3[8] = _mm_sub_epi32(lstep2[6], lstep2[8]); 
+ lstep3[9] = _mm_sub_epi32(lstep2[7], lstep2[9]); + lstep3[10] = _mm_sub_epi32(lstep2[4], lstep2[10]); + lstep3[11] = _mm_sub_epi32(lstep2[5], lstep2[11]); + lstep3[12] = _mm_sub_epi32(lstep2[2], lstep2[12]); + lstep3[13] = _mm_sub_epi32(lstep2[3], lstep2[13]); + lstep3[14] = _mm_sub_epi32(lstep2[0], lstep2[14]); + lstep3[15] = _mm_sub_epi32(lstep2[1], lstep2[15]); + } + { + const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); + const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); + const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); + const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); + const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); + lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); + lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); + lstep3[23] = 
_mm_srai_epi32(s3_11_5, DCT_CONST_BITS); + lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); + lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); + lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); + lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); + } + { + lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero); + lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero); + lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero); + lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero); + lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero); + lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero); + lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero); + lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero); + lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero); + lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero); + lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero); + lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero); + lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero); + lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero); + lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero); + lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero); + lstep2[40] = _mm_madd_epi16(lstep2[40], kOne); + lstep2[41] = _mm_madd_epi16(lstep2[41], kOne); + lstep2[42] = _mm_madd_epi16(lstep2[42], kOne); + lstep2[43] = _mm_madd_epi16(lstep2[43], kOne); + lstep2[44] = _mm_madd_epi16(lstep2[44], kOne); + lstep2[45] = _mm_madd_epi16(lstep2[45], kOne); + lstep2[46] = _mm_madd_epi16(lstep2[46], kOne); + lstep2[47] = _mm_madd_epi16(lstep2[47], kOne); + lstep2[48] = _mm_madd_epi16(lstep2[48], kOne); + lstep2[49] = _mm_madd_epi16(lstep2[49], kOne); + lstep2[50] = _mm_madd_epi16(lstep2[50], kOne); + lstep2[51] = _mm_madd_epi16(lstep2[51], kOne); + lstep2[52] = _mm_madd_epi16(lstep2[52], kOne); + lstep2[53] = _mm_madd_epi16(lstep2[53], kOne); + lstep2[54] = _mm_madd_epi16(lstep2[54], kOne); + lstep2[55] = _mm_madd_epi16(lstep2[55], kOne); + + lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero); + lstep1[33] = 
_mm_unpackhi_epi16(step1[16], kZero); + lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero); + lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero); + lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero); + lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero); + lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero); + lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero); + lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero); + lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero); + lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero); + lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero); + lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero); + lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero); + lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero); + lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero); + lstep1[32] = _mm_madd_epi16(lstep1[32], kOne); + lstep1[33] = _mm_madd_epi16(lstep1[33], kOne); + lstep1[34] = _mm_madd_epi16(lstep1[34], kOne); + lstep1[35] = _mm_madd_epi16(lstep1[35], kOne); + lstep1[36] = _mm_madd_epi16(lstep1[36], kOne); + lstep1[37] = _mm_madd_epi16(lstep1[37], kOne); + lstep1[38] = _mm_madd_epi16(lstep1[38], kOne); + lstep1[39] = _mm_madd_epi16(lstep1[39], kOne); + lstep1[56] = _mm_madd_epi16(lstep1[56], kOne); + lstep1[57] = _mm_madd_epi16(lstep1[57], kOne); + lstep1[58] = _mm_madd_epi16(lstep1[58], kOne); + lstep1[59] = _mm_madd_epi16(lstep1[59], kOne); + lstep1[60] = _mm_madd_epi16(lstep1[60], kOne); + lstep1[61] = _mm_madd_epi16(lstep1[61], kOne); + lstep1[62] = _mm_madd_epi16(lstep1[62], kOne); + lstep1[63] = _mm_madd_epi16(lstep1[63], kOne); + + lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]); + lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]); + + lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]); + lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]); + lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]); + lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]); + lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]); + lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]); 
+ lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]); + lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]); + lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]); + lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]); + lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]); + lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]); + lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]); + lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]); + lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]); + lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]); + lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]); + lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]); + lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]); + lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]); + lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]); + lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]); + lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]); + lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]); + lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]); + lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]); + lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]); + lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]); + lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]); + lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]); + } + + // stage 4 + { + // expanding to 32-bit length priori to addition operations + lstep2[16] = _mm_unpacklo_epi16(step2[8], kZero); + lstep2[17] = _mm_unpackhi_epi16(step2[8], kZero); + lstep2[18] = _mm_unpacklo_epi16(step2[9], kZero); + lstep2[19] = _mm_unpackhi_epi16(step2[9], kZero); + lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero); + lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero); + lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero); + lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero); + lstep2[16] = _mm_madd_epi16(lstep2[16], kOne); + lstep2[17] = _mm_madd_epi16(lstep2[17], kOne); + lstep2[18] = _mm_madd_epi16(lstep2[18], kOne); + lstep2[19] = _mm_madd_epi16(lstep2[19], kOne); + 
lstep2[28] = _mm_madd_epi16(lstep2[28], kOne); + lstep2[29] = _mm_madd_epi16(lstep2[29], kOne); + lstep2[30] = _mm_madd_epi16(lstep2[30], kOne); + lstep2[31] = _mm_madd_epi16(lstep2[31], kOne); + + lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]); + lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]); + lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]); + lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]); + lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]); + lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]); + lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]); + lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]); + lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]); + lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]); + lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]); + lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]); + lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]); + lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]); + lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]); + lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]); + lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]); + lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]); + lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]); + lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]); + lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]); + lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]); + lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]); + lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]); + } + { + // to be continued... + // + const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); + const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); + + u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); + u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); + u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); + u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); + + // TODO(jingning): manually inline k_madd_epi32_ to further hide + // instruction latency. 
+ v[0] = k_madd_epi32(u[0], k32_p16_m16); + v[1] = k_madd_epi32(u[1], k32_p16_m16); + v[2] = k_madd_epi32(u[2], k32_p16_m16); + v[3] = k_madd_epi32(u[3], k32_p16_m16); + v[4] = k_madd_epi32(u[0], k32_p16_p16); + v[5] = k_madd_epi32(u[1], k32_p16_p16); + v[6] = k_madd_epi32(u[2], k32_p16_p16); + v[7] = k_madd_epi32(u[3], k32_p16_p16); +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4], + &v[5], &v[6], &v[7], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + + lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + } + { + const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); + const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); + const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); + + u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]); + u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]); + u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]); + u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]); + u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]); + u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]); + u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]); + u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]); + u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]); + u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]); + u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]); + u[11] = 
_mm_unpackhi_epi32(lstep3[41], lstep3[55]); + u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]); + u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]); + u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]); + u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]); + + v[0] = k_madd_epi32(u[0], k32_m08_p24); + v[1] = k_madd_epi32(u[1], k32_m08_p24); + v[2] = k_madd_epi32(u[2], k32_m08_p24); + v[3] = k_madd_epi32(u[3], k32_m08_p24); + v[4] = k_madd_epi32(u[4], k32_m08_p24); + v[5] = k_madd_epi32(u[5], k32_m08_p24); + v[6] = k_madd_epi32(u[6], k32_m08_p24); + v[7] = k_madd_epi32(u[7], k32_m08_p24); + v[8] = k_madd_epi32(u[8], k32_m24_m08); + v[9] = k_madd_epi32(u[9], k32_m24_m08); + v[10] = k_madd_epi32(u[10], k32_m24_m08); + v[11] = k_madd_epi32(u[11], k32_m24_m08); + v[12] = k_madd_epi32(u[12], k32_m24_m08); + v[13] = k_madd_epi32(u[13], k32_m24_m08); + v[14] = k_madd_epi32(u[14], k32_m24_m08); + v[15] = k_madd_epi32(u[15], k32_m24_m08); + v[16] = k_madd_epi32(u[12], k32_m08_p24); + v[17] = k_madd_epi32(u[13], k32_m08_p24); + v[18] = k_madd_epi32(u[14], k32_m08_p24); + v[19] = k_madd_epi32(u[15], k32_m08_p24); + v[20] = k_madd_epi32(u[8], k32_m08_p24); + v[21] = k_madd_epi32(u[9], k32_m08_p24); + v[22] = k_madd_epi32(u[10], k32_m08_p24); + v[23] = k_madd_epi32(u[11], k32_m08_p24); + v[24] = k_madd_epi32(u[4], k32_p24_p08); + v[25] = k_madd_epi32(u[5], k32_p24_p08); + v[26] = k_madd_epi32(u[6], k32_p24_p08); + v[27] = k_madd_epi32(u[7], k32_p24_p08); + v[28] = k_madd_epi32(u[0], k32_p24_p08); + v[29] = k_madd_epi32(u[1], k32_p24_p08); + v[30] = k_madd_epi32(u[2], k32_p24_p08); + v[31] = k_madd_epi32(u[3], k32_p24_p08); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_32( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); + if (overflow) { + 
HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + 
lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + } + // stage 5 + { + lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]); + lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]); + lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]); + lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]); + lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]); + lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]); + lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]); + lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]); + } + { + const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); + const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); + const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); + const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); + + u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]); + u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]); + u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]); + u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]); + u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]); + u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]); + u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]); + u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]); + + // TODO(jingning): manually inline k_madd_epi32_ to further hide + // instruction latency. 
+ v[0] = k_madd_epi32(u[0], k32_p16_p16); + v[1] = k_madd_epi32(u[1], k32_p16_p16); + v[2] = k_madd_epi32(u[2], k32_p16_p16); + v[3] = k_madd_epi32(u[3], k32_p16_p16); + v[4] = k_madd_epi32(u[0], k32_p16_m16); + v[5] = k_madd_epi32(u[1], k32_p16_m16); + v[6] = k_madd_epi32(u[2], k32_p16_m16); + v[7] = k_madd_epi32(u[3], k32_p16_m16); + v[8] = k_madd_epi32(u[4], k32_p24_p08); + v[9] = k_madd_epi32(u[5], k32_p24_p08); + v[10] = k_madd_epi32(u[6], k32_p24_p08); + v[11] = k_madd_epi32(u[7], k32_p24_p08); + v[12] = k_madd_epi32(u[4], k32_m08_p24); + v[13] = k_madd_epi32(u[5], k32_m08_p24); + v[14] = k_madd_epi32(u[6], k32_m08_p24); + v[15] = k_madd_epi32(u[7], k32_m08_p24); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_16( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + 
u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + + sign[0] = _mm_cmplt_epi32(u[0], kZero); + sign[1] = _mm_cmplt_epi32(u[1], kZero); + sign[2] = _mm_cmplt_epi32(u[2], kZero); + sign[3] = _mm_cmplt_epi32(u[3], kZero); + sign[4] = _mm_cmplt_epi32(u[4], kZero); + sign[5] = _mm_cmplt_epi32(u[5], kZero); + sign[6] = _mm_cmplt_epi32(u[6], kZero); + sign[7] = _mm_cmplt_epi32(u[7], kZero); + + u[0] = _mm_sub_epi32(u[0], sign[0]); + u[1] = _mm_sub_epi32(u[1], sign[1]); + u[2] = _mm_sub_epi32(u[2], sign[2]); + u[3] = _mm_sub_epi32(u[3], sign[3]); + u[4] = _mm_sub_epi32(u[4], sign[4]); + u[5] = _mm_sub_epi32(u[5], sign[5]); + u[6] = _mm_sub_epi32(u[6], sign[6]); + u[7] = _mm_sub_epi32(u[7], sign[7]); + + u[0] = _mm_add_epi32(u[0], K32One); + u[1] = _mm_add_epi32(u[1], K32One); + u[2] = _mm_add_epi32(u[2], K32One); + u[3] = _mm_add_epi32(u[3], K32One); + u[4] = _mm_add_epi32(u[4], K32One); + u[5] = _mm_add_epi32(u[5], K32One); + u[6] = _mm_add_epi32(u[6], K32One); + u[7] = _mm_add_epi32(u[7], K32One); + + u[0] = _mm_srai_epi32(u[0], 2); + u[1] = _mm_srai_epi32(u[1], 2); + u[2] = _mm_srai_epi32(u[2], 2); + u[3] = _mm_srai_epi32(u[3], 2); + u[4] = _mm_srai_epi32(u[4], 2); + u[5] = _mm_srai_epi32(u[5], 2); + u[6] = _mm_srai_epi32(u[6], 2); + u[7] = _mm_srai_epi32(u[7], 2); + + // Combine + out[0] = _mm_packs_epi32(u[0], u[1]); + out[16] = _mm_packs_epi32(u[2], u[3]); + out[8] = _mm_packs_epi32(u[4], u[5]); + out[24] = _mm_packs_epi32(u[6], u[7]); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); + const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); + const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); + + u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]); + 
u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]); + u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]); + u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]); + u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]); + u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]); + u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]); + u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]); + + v[0] = k_madd_epi32(u[0], k32_m08_p24); + v[1] = k_madd_epi32(u[1], k32_m08_p24); + v[2] = k_madd_epi32(u[2], k32_m08_p24); + v[3] = k_madd_epi32(u[3], k32_m08_p24); + v[4] = k_madd_epi32(u[4], k32_m24_m08); + v[5] = k_madd_epi32(u[5], k32_m24_m08); + v[6] = k_madd_epi32(u[6], k32_m24_m08); + v[7] = k_madd_epi32(u[7], k32_m24_m08); + v[8] = k_madd_epi32(u[4], k32_m08_p24); + v[9] = k_madd_epi32(u[5], k32_m08_p24); + v[10] = k_madd_epi32(u[6], k32_m08_p24); + v[11] = k_madd_epi32(u[7], k32_m08_p24); + v[12] = k_madd_epi32(u[0], k32_p24_p08); + v[13] = k_madd_epi32(u[1], k32_p24_p08); + v[14] = k_madd_epi32(u[2], k32_p24_p08); + v[15] = k_madd_epi32(u[3], k32_p24_p08); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_16( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = 
_mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + } + { + lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]); + lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]); + lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]); + lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]); + lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]); + lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]); + lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]); + lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]); + lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]); + lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]); + lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]); + lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]); + lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]); + lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]); + lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]); + lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]); + lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]); + lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]); + lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]); + lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]); + lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]); + lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]); + lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]); + lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]); + lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]); + lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]); + lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]); + 
lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]); + lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]); + lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]); + lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]); + lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]); + } + // stage 6 + { + const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); + const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); + const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); + const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); + + u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]); + u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]); + u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]); + u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]); + u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); + u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); + u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); + u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); + u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); + u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); + u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); + u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); + u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]); + u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]); + u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]); + u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]); + + v[0] = k_madd_epi32(u[0], k32_p28_p04); + v[1] = k_madd_epi32(u[1], k32_p28_p04); + v[2] = k_madd_epi32(u[2], k32_p28_p04); + v[3] = k_madd_epi32(u[3], k32_p28_p04); + v[4] = k_madd_epi32(u[4], k32_p12_p20); + v[5] = k_madd_epi32(u[5], k32_p12_p20); + v[6] = k_madd_epi32(u[6], k32_p12_p20); + v[7] = k_madd_epi32(u[7], k32_p12_p20); + v[8] = k_madd_epi32(u[8], k32_m20_p12); + v[9] = k_madd_epi32(u[9], k32_m20_p12); + v[10] = k_madd_epi32(u[10], k32_m20_p12); + v[11] = k_madd_epi32(u[11], k32_m20_p12); + v[12] = k_madd_epi32(u[12], k32_m04_p28); + v[13] = 
k_madd_epi32(u[13], k32_m04_p28); + v[14] = k_madd_epi32(u[14], k32_m04_p28); + v[15] = k_madd_epi32(u[15], k32_m04_p28); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_16( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + + sign[0] = _mm_cmplt_epi32(u[0], kZero); + sign[1] = _mm_cmplt_epi32(u[1], kZero); + sign[2] = _mm_cmplt_epi32(u[2], kZero); + sign[3] = _mm_cmplt_epi32(u[3], kZero); + sign[4] = _mm_cmplt_epi32(u[4], kZero); + sign[5] = _mm_cmplt_epi32(u[5], kZero); + sign[6] = _mm_cmplt_epi32(u[6], kZero); + sign[7] = _mm_cmplt_epi32(u[7], kZero); + + u[0] = _mm_sub_epi32(u[0], sign[0]); + u[1] = _mm_sub_epi32(u[1], sign[1]); + u[2] = _mm_sub_epi32(u[2], sign[2]); + u[3] = 
_mm_sub_epi32(u[3], sign[3]); + u[4] = _mm_sub_epi32(u[4], sign[4]); + u[5] = _mm_sub_epi32(u[5], sign[5]); + u[6] = _mm_sub_epi32(u[6], sign[6]); + u[7] = _mm_sub_epi32(u[7], sign[7]); + + u[0] = _mm_add_epi32(u[0], K32One); + u[1] = _mm_add_epi32(u[1], K32One); + u[2] = _mm_add_epi32(u[2], K32One); + u[3] = _mm_add_epi32(u[3], K32One); + u[4] = _mm_add_epi32(u[4], K32One); + u[5] = _mm_add_epi32(u[5], K32One); + u[6] = _mm_add_epi32(u[6], K32One); + u[7] = _mm_add_epi32(u[7], K32One); + + u[0] = _mm_srai_epi32(u[0], 2); + u[1] = _mm_srai_epi32(u[1], 2); + u[2] = _mm_srai_epi32(u[2], 2); + u[3] = _mm_srai_epi32(u[3], 2); + u[4] = _mm_srai_epi32(u[4], 2); + u[5] = _mm_srai_epi32(u[5], 2); + u[6] = _mm_srai_epi32(u[6], 2); + u[7] = _mm_srai_epi32(u[7], 2); + + out[4] = _mm_packs_epi32(u[0], u[1]); + out[20] = _mm_packs_epi32(u[2], u[3]); + out[12] = _mm_packs_epi32(u[4], u[5]); + out[28] = _mm_packs_epi32(u[6], u[7]); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]); + lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]); + lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]); + lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]); + lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]); + lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]); + lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]); + lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]); + lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]); + lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]); + lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]); + lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]); + lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]); + lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]); + lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]); + lstep3[31] = 
_mm_add_epi32(lstep2[29], lstep1[31]); + } + { + const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); + const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64); + const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); + const __m128i k32_m12_m20 = + pair_set_epi32(-cospi_12_64, -cospi_20_64); + const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); + const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); + + u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]); + u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]); + u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]); + u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]); + u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]); + u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]); + u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]); + u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]); + u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]); + u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]); + u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]); + u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]); + u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]); + u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]); + u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]); + u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]); + + v[0] = k_madd_epi32(u[0], k32_m04_p28); + v[1] = k_madd_epi32(u[1], k32_m04_p28); + v[2] = k_madd_epi32(u[2], k32_m04_p28); + v[3] = k_madd_epi32(u[3], k32_m04_p28); + v[4] = k_madd_epi32(u[4], k32_m28_m04); + v[5] = k_madd_epi32(u[5], k32_m28_m04); + v[6] = k_madd_epi32(u[6], k32_m28_m04); + v[7] = k_madd_epi32(u[7], k32_m28_m04); + v[8] = k_madd_epi32(u[8], k32_m20_p12); + v[9] = k_madd_epi32(u[9], k32_m20_p12); + v[10] = k_madd_epi32(u[10], k32_m20_p12); + v[11] = k_madd_epi32(u[11], k32_m20_p12); + v[12] = k_madd_epi32(u[12], k32_m12_m20); + v[13] = k_madd_epi32(u[13], k32_m12_m20); + v[14] = k_madd_epi32(u[14], k32_m12_m20); + v[15] = 
k_madd_epi32(u[15], k32_m12_m20); + v[16] = k_madd_epi32(u[12], k32_m20_p12); + v[17] = k_madd_epi32(u[13], k32_m20_p12); + v[18] = k_madd_epi32(u[14], k32_m20_p12); + v[19] = k_madd_epi32(u[15], k32_m20_p12); + v[20] = k_madd_epi32(u[8], k32_p12_p20); + v[21] = k_madd_epi32(u[9], k32_p12_p20); + v[22] = k_madd_epi32(u[10], k32_p12_p20); + v[23] = k_madd_epi32(u[11], k32_p12_p20); + v[24] = k_madd_epi32(u[4], k32_m04_p28); + v[25] = k_madd_epi32(u[5], k32_m04_p28); + v[26] = k_madd_epi32(u[6], k32_m04_p28); + v[27] = k_madd_epi32(u[7], k32_m04_p28); + v[28] = k_madd_epi32(u[0], k32_p28_p04); + v[29] = k_madd_epi32(u[1], k32_p28_p04); + v[30] = k_madd_epi32(u[2], k32_p28_p04); + v[31] = k_madd_epi32(u[3], k32_p28_p04); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_32( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = 
_mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + } + // stage 7 + { + const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64); + const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64); + const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64); + const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64); + const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64); + const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64); + const __m128i k32_m18_p14 = 
pair_set_epi32(-cospi_18_64, cospi_14_64); + const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64); + + u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]); + u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]); + u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]); + u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]); + u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]); + u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]); + u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]); + u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]); + u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]); + u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]); + u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]); + u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]); + u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]); + u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]); + u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]); + u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]); + + v[0] = k_madd_epi32(u[0], k32_p30_p02); + v[1] = k_madd_epi32(u[1], k32_p30_p02); + v[2] = k_madd_epi32(u[2], k32_p30_p02); + v[3] = k_madd_epi32(u[3], k32_p30_p02); + v[4] = k_madd_epi32(u[4], k32_p14_p18); + v[5] = k_madd_epi32(u[5], k32_p14_p18); + v[6] = k_madd_epi32(u[6], k32_p14_p18); + v[7] = k_madd_epi32(u[7], k32_p14_p18); + v[8] = k_madd_epi32(u[8], k32_p22_p10); + v[9] = k_madd_epi32(u[9], k32_p22_p10); + v[10] = k_madd_epi32(u[10], k32_p22_p10); + v[11] = k_madd_epi32(u[11], k32_p22_p10); + v[12] = k_madd_epi32(u[12], k32_p06_p26); + v[13] = k_madd_epi32(u[13], k32_p06_p26); + v[14] = k_madd_epi32(u[14], k32_p06_p26); + v[15] = k_madd_epi32(u[15], k32_p06_p26); + v[16] = k_madd_epi32(u[12], k32_m26_p06); + v[17] = k_madd_epi32(u[13], k32_m26_p06); + v[18] = k_madd_epi32(u[14], k32_m26_p06); + v[19] = k_madd_epi32(u[15], k32_m26_p06); + v[20] = k_madd_epi32(u[8], k32_m10_p22); + v[21] = k_madd_epi32(u[9], k32_m10_p22); + v[22] = k_madd_epi32(u[10], k32_m10_p22); + v[23] = k_madd_epi32(u[11], 
k32_m10_p22); + v[24] = k_madd_epi32(u[4], k32_m18_p14); + v[25] = k_madd_epi32(u[5], k32_m18_p14); + v[26] = k_madd_epi32(u[6], k32_m18_p14); + v[27] = k_madd_epi32(u[7], k32_m18_p14); + v[28] = k_madd_epi32(u[0], k32_m02_p30); + v[29] = k_madd_epi32(u[1], k32_m02_p30); + v[30] = k_madd_epi32(u[2], k32_m02_p30); + v[31] = k_madd_epi32(u[3], k32_m02_p30); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_32( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + 
v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + v[0] = _mm_cmplt_epi32(u[0], kZero); + v[1] = _mm_cmplt_epi32(u[1], kZero); + v[2] = _mm_cmplt_epi32(u[2], kZero); + v[3] = _mm_cmplt_epi32(u[3], kZero); + v[4] = _mm_cmplt_epi32(u[4], kZero); + v[5] = _mm_cmplt_epi32(u[5], kZero); + v[6] = _mm_cmplt_epi32(u[6], kZero); + v[7] = _mm_cmplt_epi32(u[7], kZero); + v[8] = _mm_cmplt_epi32(u[8], kZero); + v[9] = _mm_cmplt_epi32(u[9], kZero); + v[10] = _mm_cmplt_epi32(u[10], kZero); + v[11] = _mm_cmplt_epi32(u[11], kZero); + v[12] = _mm_cmplt_epi32(u[12], kZero); + v[13] = _mm_cmplt_epi32(u[13], kZero); + v[14] = _mm_cmplt_epi32(u[14], kZero); + v[15] = _mm_cmplt_epi32(u[15], kZero); + + u[0] = _mm_sub_epi32(u[0], v[0]); + u[1] = _mm_sub_epi32(u[1], v[1]); + u[2] = _mm_sub_epi32(u[2], v[2]); + u[3] = _mm_sub_epi32(u[3], v[3]); + u[4] = _mm_sub_epi32(u[4], v[4]); + u[5] = _mm_sub_epi32(u[5], v[5]); + u[6] = _mm_sub_epi32(u[6], v[6]); + u[7] = _mm_sub_epi32(u[7], v[7]); + u[8] = _mm_sub_epi32(u[8], 
v[8]); + u[9] = _mm_sub_epi32(u[9], v[9]); + u[10] = _mm_sub_epi32(u[10], v[10]); + u[11] = _mm_sub_epi32(u[11], v[11]); + u[12] = _mm_sub_epi32(u[12], v[12]); + u[13] = _mm_sub_epi32(u[13], v[13]); + u[14] = _mm_sub_epi32(u[14], v[14]); + u[15] = _mm_sub_epi32(u[15], v[15]); + + v[0] = _mm_add_epi32(u[0], K32One); + v[1] = _mm_add_epi32(u[1], K32One); + v[2] = _mm_add_epi32(u[2], K32One); + v[3] = _mm_add_epi32(u[3], K32One); + v[4] = _mm_add_epi32(u[4], K32One); + v[5] = _mm_add_epi32(u[5], K32One); + v[6] = _mm_add_epi32(u[6], K32One); + v[7] = _mm_add_epi32(u[7], K32One); + v[8] = _mm_add_epi32(u[8], K32One); + v[9] = _mm_add_epi32(u[9], K32One); + v[10] = _mm_add_epi32(u[10], K32One); + v[11] = _mm_add_epi32(u[11], K32One); + v[12] = _mm_add_epi32(u[12], K32One); + v[13] = _mm_add_epi32(u[13], K32One); + v[14] = _mm_add_epi32(u[14], K32One); + v[15] = _mm_add_epi32(u[15], K32One); + + u[0] = _mm_srai_epi32(v[0], 2); + u[1] = _mm_srai_epi32(v[1], 2); + u[2] = _mm_srai_epi32(v[2], 2); + u[3] = _mm_srai_epi32(v[3], 2); + u[4] = _mm_srai_epi32(v[4], 2); + u[5] = _mm_srai_epi32(v[5], 2); + u[6] = _mm_srai_epi32(v[6], 2); + u[7] = _mm_srai_epi32(v[7], 2); + u[8] = _mm_srai_epi32(v[8], 2); + u[9] = _mm_srai_epi32(v[9], 2); + u[10] = _mm_srai_epi32(v[10], 2); + u[11] = _mm_srai_epi32(v[11], 2); + u[12] = _mm_srai_epi32(v[12], 2); + u[13] = _mm_srai_epi32(v[13], 2); + u[14] = _mm_srai_epi32(v[14], 2); + u[15] = _mm_srai_epi32(v[15], 2); + + out[2] = _mm_packs_epi32(u[0], u[1]); + out[18] = _mm_packs_epi32(u[2], u[3]); + out[10] = _mm_packs_epi32(u[4], u[5]); + out[26] = _mm_packs_epi32(u[6], u[7]); + out[6] = _mm_packs_epi32(u[8], u[9]); + out[22] = _mm_packs_epi32(u[10], u[11]); + out[14] = _mm_packs_epi32(u[12], u[13]); + out[30] = _mm_packs_epi32(u[14], u[15]); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26], + &out[6], &out[22], &out[14], &out[30]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, 
output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]); + lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]); + lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]); + lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]); + lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]); + lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]); + lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]); + lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]); + lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]); + lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]); + lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]); + lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]); + lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]); + lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]); + lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]); + lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]); + lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]); + lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]); + lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]); + lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]); + lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]); + lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]); + lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]); + lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]); + lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]); + lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]); + lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]); + lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]); + lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]); + lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]); + lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]); + lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]); + } + // stage 8 + { + const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64); + const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64); + const __m128i k32_p23_p09 = 
pair_set_epi32(cospi_23_64, cospi_9_64); + const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64); + const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64); + const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64); + const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64); + const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64); + + u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]); + u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]); + u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]); + u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]); + u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]); + u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]); + u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]); + u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]); + u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]); + u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]); + u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]); + u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]); + u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]); + u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]); + u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]); + u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]); + + v[0] = k_madd_epi32(u[0], k32_p31_p01); + v[1] = k_madd_epi32(u[1], k32_p31_p01); + v[2] = k_madd_epi32(u[2], k32_p31_p01); + v[3] = k_madd_epi32(u[3], k32_p31_p01); + v[4] = k_madd_epi32(u[4], k32_p15_p17); + v[5] = k_madd_epi32(u[5], k32_p15_p17); + v[6] = k_madd_epi32(u[6], k32_p15_p17); + v[7] = k_madd_epi32(u[7], k32_p15_p17); + v[8] = k_madd_epi32(u[8], k32_p23_p09); + v[9] = k_madd_epi32(u[9], k32_p23_p09); + v[10] = k_madd_epi32(u[10], k32_p23_p09); + v[11] = k_madd_epi32(u[11], k32_p23_p09); + v[12] = k_madd_epi32(u[12], k32_p07_p25); + v[13] = k_madd_epi32(u[13], k32_p07_p25); + v[14] = k_madd_epi32(u[14], k32_p07_p25); + v[15] = k_madd_epi32(u[15], k32_p07_p25); + v[16] = k_madd_epi32(u[12], k32_m25_p07); + v[17] = 
k_madd_epi32(u[13], k32_m25_p07); + v[18] = k_madd_epi32(u[14], k32_m25_p07); + v[19] = k_madd_epi32(u[15], k32_m25_p07); + v[20] = k_madd_epi32(u[8], k32_m09_p23); + v[21] = k_madd_epi32(u[9], k32_m09_p23); + v[22] = k_madd_epi32(u[10], k32_m09_p23); + v[23] = k_madd_epi32(u[11], k32_m09_p23); + v[24] = k_madd_epi32(u[4], k32_m17_p15); + v[25] = k_madd_epi32(u[5], k32_m17_p15); + v[26] = k_madd_epi32(u[6], k32_m17_p15); + v[27] = k_madd_epi32(u[7], k32_m17_p15); + v[28] = k_madd_epi32(u[0], k32_m01_p31); + v[29] = k_madd_epi32(u[1], k32_m01_p31); + v[30] = k_madd_epi32(u[2], k32_m01_p31); + v[31] = k_madd_epi32(u[3], k32_m01_p31); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_32( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], 
k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + v[0] = _mm_cmplt_epi32(u[0], kZero); + v[1] = _mm_cmplt_epi32(u[1], kZero); + v[2] = _mm_cmplt_epi32(u[2], kZero); + v[3] = _mm_cmplt_epi32(u[3], kZero); + v[4] = _mm_cmplt_epi32(u[4], kZero); + v[5] = _mm_cmplt_epi32(u[5], kZero); + v[6] = _mm_cmplt_epi32(u[6], kZero); + v[7] = _mm_cmplt_epi32(u[7], kZero); + v[8] = _mm_cmplt_epi32(u[8], kZero); + v[9] = _mm_cmplt_epi32(u[9], kZero); + v[10] = _mm_cmplt_epi32(u[10], kZero); + v[11] = _mm_cmplt_epi32(u[11], kZero); + v[12] = _mm_cmplt_epi32(u[12], kZero); + v[13] = _mm_cmplt_epi32(u[13], kZero); + v[14] = _mm_cmplt_epi32(u[14], kZero); + v[15] = _mm_cmplt_epi32(u[15], kZero); + + u[0] = 
_mm_sub_epi32(u[0], v[0]); + u[1] = _mm_sub_epi32(u[1], v[1]); + u[2] = _mm_sub_epi32(u[2], v[2]); + u[3] = _mm_sub_epi32(u[3], v[3]); + u[4] = _mm_sub_epi32(u[4], v[4]); + u[5] = _mm_sub_epi32(u[5], v[5]); + u[6] = _mm_sub_epi32(u[6], v[6]); + u[7] = _mm_sub_epi32(u[7], v[7]); + u[8] = _mm_sub_epi32(u[8], v[8]); + u[9] = _mm_sub_epi32(u[9], v[9]); + u[10] = _mm_sub_epi32(u[10], v[10]); + u[11] = _mm_sub_epi32(u[11], v[11]); + u[12] = _mm_sub_epi32(u[12], v[12]); + u[13] = _mm_sub_epi32(u[13], v[13]); + u[14] = _mm_sub_epi32(u[14], v[14]); + u[15] = _mm_sub_epi32(u[15], v[15]); + + v[0] = _mm_add_epi32(u[0], K32One); + v[1] = _mm_add_epi32(u[1], K32One); + v[2] = _mm_add_epi32(u[2], K32One); + v[3] = _mm_add_epi32(u[3], K32One); + v[4] = _mm_add_epi32(u[4], K32One); + v[5] = _mm_add_epi32(u[5], K32One); + v[6] = _mm_add_epi32(u[6], K32One); + v[7] = _mm_add_epi32(u[7], K32One); + v[8] = _mm_add_epi32(u[8], K32One); + v[9] = _mm_add_epi32(u[9], K32One); + v[10] = _mm_add_epi32(u[10], K32One); + v[11] = _mm_add_epi32(u[11], K32One); + v[12] = _mm_add_epi32(u[12], K32One); + v[13] = _mm_add_epi32(u[13], K32One); + v[14] = _mm_add_epi32(u[14], K32One); + v[15] = _mm_add_epi32(u[15], K32One); + + u[0] = _mm_srai_epi32(v[0], 2); + u[1] = _mm_srai_epi32(v[1], 2); + u[2] = _mm_srai_epi32(v[2], 2); + u[3] = _mm_srai_epi32(v[3], 2); + u[4] = _mm_srai_epi32(v[4], 2); + u[5] = _mm_srai_epi32(v[5], 2); + u[6] = _mm_srai_epi32(v[6], 2); + u[7] = _mm_srai_epi32(v[7], 2); + u[8] = _mm_srai_epi32(v[8], 2); + u[9] = _mm_srai_epi32(v[9], 2); + u[10] = _mm_srai_epi32(v[10], 2); + u[11] = _mm_srai_epi32(v[11], 2); + u[12] = _mm_srai_epi32(v[12], 2); + u[13] = _mm_srai_epi32(v[13], 2); + u[14] = _mm_srai_epi32(v[14], 2); + u[15] = _mm_srai_epi32(v[15], 2); + + out[1] = _mm_packs_epi32(u[0], u[1]); + out[17] = _mm_packs_epi32(u[2], u[3]); + out[9] = _mm_packs_epi32(u[4], u[5]); + out[25] = _mm_packs_epi32(u[6], u[7]); + out[7] = _mm_packs_epi32(u[8], u[9]); + out[23] = 
_mm_packs_epi32(u[10], u[11]); + out[15] = _mm_packs_epi32(u[12], u[13]); + out[31] = _mm_packs_epi32(u[14], u[15]); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25], + &out[7], &out[23], &out[15], &out[31]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64); + const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64); + const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64); + const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64); + const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64); + const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64); + const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64); + const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64); + + u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]); + u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]); + u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]); + u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]); + u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]); + u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]); + u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]); + u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]); + u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]); + u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]); + u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]); + u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]); + u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]); + u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]); + u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]); + u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]); + + v[0] = k_madd_epi32(u[0], k32_p27_p05); + v[1] = k_madd_epi32(u[1], k32_p27_p05); + v[2] = k_madd_epi32(u[2], k32_p27_p05); + v[3] = k_madd_epi32(u[3], k32_p27_p05); + v[4] = 
k_madd_epi32(u[4], k32_p11_p21); + v[5] = k_madd_epi32(u[5], k32_p11_p21); + v[6] = k_madd_epi32(u[6], k32_p11_p21); + v[7] = k_madd_epi32(u[7], k32_p11_p21); + v[8] = k_madd_epi32(u[8], k32_p19_p13); + v[9] = k_madd_epi32(u[9], k32_p19_p13); + v[10] = k_madd_epi32(u[10], k32_p19_p13); + v[11] = k_madd_epi32(u[11], k32_p19_p13); + v[12] = k_madd_epi32(u[12], k32_p03_p29); + v[13] = k_madd_epi32(u[13], k32_p03_p29); + v[14] = k_madd_epi32(u[14], k32_p03_p29); + v[15] = k_madd_epi32(u[15], k32_p03_p29); + v[16] = k_madd_epi32(u[12], k32_m29_p03); + v[17] = k_madd_epi32(u[13], k32_m29_p03); + v[18] = k_madd_epi32(u[14], k32_m29_p03); + v[19] = k_madd_epi32(u[15], k32_m29_p03); + v[20] = k_madd_epi32(u[8], k32_m13_p19); + v[21] = k_madd_epi32(u[9], k32_m13_p19); + v[22] = k_madd_epi32(u[10], k32_m13_p19); + v[23] = k_madd_epi32(u[11], k32_m13_p19); + v[24] = k_madd_epi32(u[4], k32_m21_p11); + v[25] = k_madd_epi32(u[5], k32_m21_p11); + v[26] = k_madd_epi32(u[6], k32_m21_p11); + v[27] = k_madd_epi32(u[7], k32_m21_p11); + v[28] = k_madd_epi32(u[0], k32_m05_p27); + v[29] = k_madd_epi32(u[1], k32_m05_p27); + v[30] = k_madd_epi32(u[2], k32_m05_p27); + v[31] = k_madd_epi32(u[3], k32_m05_p27); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_32( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = 
k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + v[0] = _mm_cmplt_epi32(u[0], kZero); + v[1] = _mm_cmplt_epi32(u[1], kZero); + v[2] = 
_mm_cmplt_epi32(u[2], kZero); + v[3] = _mm_cmplt_epi32(u[3], kZero); + v[4] = _mm_cmplt_epi32(u[4], kZero); + v[5] = _mm_cmplt_epi32(u[5], kZero); + v[6] = _mm_cmplt_epi32(u[6], kZero); + v[7] = _mm_cmplt_epi32(u[7], kZero); + v[8] = _mm_cmplt_epi32(u[8], kZero); + v[9] = _mm_cmplt_epi32(u[9], kZero); + v[10] = _mm_cmplt_epi32(u[10], kZero); + v[11] = _mm_cmplt_epi32(u[11], kZero); + v[12] = _mm_cmplt_epi32(u[12], kZero); + v[13] = _mm_cmplt_epi32(u[13], kZero); + v[14] = _mm_cmplt_epi32(u[14], kZero); + v[15] = _mm_cmplt_epi32(u[15], kZero); + + u[0] = _mm_sub_epi32(u[0], v[0]); + u[1] = _mm_sub_epi32(u[1], v[1]); + u[2] = _mm_sub_epi32(u[2], v[2]); + u[3] = _mm_sub_epi32(u[3], v[3]); + u[4] = _mm_sub_epi32(u[4], v[4]); + u[5] = _mm_sub_epi32(u[5], v[5]); + u[6] = _mm_sub_epi32(u[6], v[6]); + u[7] = _mm_sub_epi32(u[7], v[7]); + u[8] = _mm_sub_epi32(u[8], v[8]); + u[9] = _mm_sub_epi32(u[9], v[9]); + u[10] = _mm_sub_epi32(u[10], v[10]); + u[11] = _mm_sub_epi32(u[11], v[11]); + u[12] = _mm_sub_epi32(u[12], v[12]); + u[13] = _mm_sub_epi32(u[13], v[13]); + u[14] = _mm_sub_epi32(u[14], v[14]); + u[15] = _mm_sub_epi32(u[15], v[15]); + + v[0] = _mm_add_epi32(u[0], K32One); + v[1] = _mm_add_epi32(u[1], K32One); + v[2] = _mm_add_epi32(u[2], K32One); + v[3] = _mm_add_epi32(u[3], K32One); + v[4] = _mm_add_epi32(u[4], K32One); + v[5] = _mm_add_epi32(u[5], K32One); + v[6] = _mm_add_epi32(u[6], K32One); + v[7] = _mm_add_epi32(u[7], K32One); + v[8] = _mm_add_epi32(u[8], K32One); + v[9] = _mm_add_epi32(u[9], K32One); + v[10] = _mm_add_epi32(u[10], K32One); + v[11] = _mm_add_epi32(u[11], K32One); + v[12] = _mm_add_epi32(u[12], K32One); + v[13] = _mm_add_epi32(u[13], K32One); + v[14] = _mm_add_epi32(u[14], K32One); + v[15] = _mm_add_epi32(u[15], K32One); + + u[0] = _mm_srai_epi32(v[0], 2); + u[1] = _mm_srai_epi32(v[1], 2); + u[2] = _mm_srai_epi32(v[2], 2); + u[3] = _mm_srai_epi32(v[3], 2); + u[4] = _mm_srai_epi32(v[4], 2); + u[5] = _mm_srai_epi32(v[5], 2); + u[6] = 
_mm_srai_epi32(v[6], 2); + u[7] = _mm_srai_epi32(v[7], 2); + u[8] = _mm_srai_epi32(v[8], 2); + u[9] = _mm_srai_epi32(v[9], 2); + u[10] = _mm_srai_epi32(v[10], 2); + u[11] = _mm_srai_epi32(v[11], 2); + u[12] = _mm_srai_epi32(v[12], 2); + u[13] = _mm_srai_epi32(v[13], 2); + u[14] = _mm_srai_epi32(v[14], 2); + u[15] = _mm_srai_epi32(v[15], 2); + + out[5] = _mm_packs_epi32(u[0], u[1]); + out[21] = _mm_packs_epi32(u[2], u[3]); + out[13] = _mm_packs_epi32(u[4], u[5]); + out[29] = _mm_packs_epi32(u[6], u[7]); + out[3] = _mm_packs_epi32(u[8], u[9]); + out[19] = _mm_packs_epi32(u[10], u[11]); + out[11] = _mm_packs_epi32(u[12], u[13]); + out[27] = _mm_packs_epi32(u[14], u[15]); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29], + &out[3], &out[19], &out[11], &out[27]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } +#endif // FDCT32x32_HIGH_PRECISION + // Transpose the results, do it as four 8x8 transposes. 
+ { + int transpose_block; + int16_t *output0 = &intermediate[column_start * 32]; + tran_low_t *output1 = &output_org[column_start * 32]; + for (transpose_block = 0; transpose_block < 4; ++transpose_block) { + __m128i *this_out = &out[8 * transpose_block]; + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 54 54 55 55 56 56 57 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 21 36 + // 44 54 64 74 45 55 61 76 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + __m128i tr2_0 = 
_mm_unpacklo_epi64(tr1_0, tr1_4); + __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + if (0 == pass) { + // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2; + // TODO(cd): see quality impact of only doing + // output[j] = (output[j] + 1) >> 2; + // which would remove the code between here ... + __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero); + __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero); + __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero); + __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero); + __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero); + __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero); + __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero); + __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero); + tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0); + tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0); + tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0); + tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0); + tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0); + tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0); + tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0); + tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0); + // ... and here. 
+ // PS: also change code in av1/encoder/av1_dct.c + tr2_0 = _mm_add_epi16(tr2_0, kOne); + tr2_1 = _mm_add_epi16(tr2_1, kOne); + tr2_2 = _mm_add_epi16(tr2_2, kOne); + tr2_3 = _mm_add_epi16(tr2_3, kOne); + tr2_4 = _mm_add_epi16(tr2_4, kOne); + tr2_5 = _mm_add_epi16(tr2_5, kOne); + tr2_6 = _mm_add_epi16(tr2_6, kOne); + tr2_7 = _mm_add_epi16(tr2_7, kOne); + tr2_0 = _mm_srai_epi16(tr2_0, 2); + tr2_1 = _mm_srai_epi16(tr2_1, 2); + tr2_2 = _mm_srai_epi16(tr2_2, 2); + tr2_3 = _mm_srai_epi16(tr2_3, 2); + tr2_4 = _mm_srai_epi16(tr2_4, 2); + tr2_5 = _mm_srai_epi16(tr2_5, 2); + tr2_6 = _mm_srai_epi16(tr2_6, 2); + tr2_7 = _mm_srai_epi16(tr2_7, 2); + } + // Note: even though all these stores are aligned, using the aligned + // intrinsic make the code slightly slower. + if (pass == 0) { + _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0); + _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1); + _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2); + _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3); + _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4); + _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5); + _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6); + _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7); + // Process next 8x8 + output0 += 8; + } else { + storeu_output(&tr2_0, (output1 + 0 * 32)); + storeu_output(&tr2_1, (output1 + 1 * 32)); + storeu_output(&tr2_2, (output1 + 2 * 32)); + storeu_output(&tr2_3, (output1 + 3 * 32)); + storeu_output(&tr2_4, (output1 + 4 * 32)); + storeu_output(&tr2_5, (output1 + 5 * 32)); + storeu_output(&tr2_6, (output1 + 6 * 32)); + storeu_output(&tr2_7, (output1 + 7 * 32)); + // Process next 8x8 + output1 += 8; + } + } + } + } + } +} // NOLINT + +#undef ADD_EPI16 +#undef SUB_EPI16 +#undef HIGH_FDCT32x32_2D_C +#undef HIGH_FDCT32x32_2D_ROWS_C diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c new file mode 100644 index 000000000..670f864d0 --- /dev/null +++ 
b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_config.h" + +#define FDCT32x32_2D_AVX2 aom_fdct32x32_rd_avx2 +#define FDCT32x32_HIGH_PRECISION 0 +#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h" +#undef FDCT32x32_2D_AVX2 +#undef FDCT32x32_HIGH_PRECISION + +#define FDCT32x32_2D_AVX2 aom_fdct32x32_avx2 +#define FDCT32x32_HIGH_PRECISION 1 +#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT +#undef FDCT32x32_2D_AVX2 +#undef FDCT32x32_HIGH_PRECISION diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h new file mode 100644 index 000000000..d3aceae00 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_DSP_X86_FWD_TXFM_AVX2_H
+#define AOM_DSP_X86_FWD_TXFM_AVX2_H
+
+#include "./aom_config.h"
+
+// Stores the 256-bit vector *coeff, treated as sixteen signed 16-bit
+// coefficients, to out using unaligned stores.
+//
+// With CONFIG_HIGHBITDEPTH set, every coefficient is first sign-extended to
+// 32 bits and the sixteen results are written in their original order as two
+// 256-bit stores, at out and at out + 8. This presumes tran_low_t is a 32-bit
+// type in that configuration; its definition is not visible from this header,
+// so confirm against the typedef. Otherwise the vector is stored unchanged.
+static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) {
+#if CONFIG_HIGHBITDEPTH
+  const __m256i zero = _mm256_setzero_si256();
+  // All-ones in each 16-bit lane holding a negative coefficient, zero in the
+  // others: exactly the upper half of the sign-extended 32-bit value.
+  const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);
+
+  // Interleaving a coefficient with its sign mask yields the 32-bit value.
+  // The unpack intrinsics operate on each 128-bit half independently, so x0
+  // holds elements 0-3 and 8-11 while x1 holds elements 4-7 and 12-15.
+  __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
+  __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);
+
+  // Regroup the 128-bit halves into memory order: y0 = elements 0-7 (low
+  // halves of x0 and x1), y1 = elements 8-15 (high halves of x0 and x1).
+  __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
+  __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);
+
+  _mm256_storeu_si256((__m256i *)out, y0);
+  _mm256_storeu_si256((__m256i *)(out + 8), y1);
+#else
+  _mm256_storeu_si256((__m256i *)out, *coeff);
+#endif
+}
+
+#endif  // AOM_DSP_X86_FWD_TXFM_AVX2_H
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
new file mode 100644
index 000000000..7bb1db70a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
@@ -0,0 +1,1014 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/fwd_txfm_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+
+// TODO(jingning) The high bit-depth functions need rework for performance.
+// After we properly fix the high bit-depth function implementations, this
+// file's dependency should be substantially simplified.
+#if DCT_HIGH_BIT_DEPTH +#define ADD_EPI16 _mm_adds_epi16 +#define SUB_EPI16 _mm_subs_epi16 + +#else +#define ADD_EPI16 _mm_add_epi16 +#define SUB_EPI16 _mm_sub_epi16 +#endif + +void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { + // This 2D transform implements 4 vertical 1D transforms followed + // by 4 horizontal 1D transforms. The multiplies and adds are as given + // by Chen, Smith and Fralick ('77). The commands for moving the data + // around have been minimized by hand. + // For the purposes of the comments, the 16 inputs are referred to at i0 + // through iF (in raster order), intermediate variables are a0, b0, c0 + // through f, and correspond to the in-place computations mapped to input + // locations. The outputs, o0 through oF are labeled according to the + // output locations. + + // Constants + // These are the coefficients used for the multiplies. + // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64), + // where cospi_N_64 = cos(N pi /64) + const __m128i k__cospi_A = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_B = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_C = + octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, + cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64); + const __m128i k__cospi_D = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64); + const __m128i k__cospi_E = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_F = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_G = + octa_set_epi16(cospi_8_64, cospi_24_64, 
cospi_8_64, cospi_24_64, + -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64); + const __m128i k__cospi_H = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64); + + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + // This second rounding constant saves doing some extra adds at the end + const __m128i k__DCT_CONST_ROUNDING2 = + _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1)); + const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2; + const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); + const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); + __m128i in0, in1; +#if DCT_HIGH_BIT_DEPTH + __m128i cmp0, cmp1; + int test, overflow; +#endif + + // Load inputs. + in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + // in0 = [i0 i1 i2 i3 iC iD iE iF] + // in1 = [i4 i5 i6 i7 i8 i9 iA iB] + in1 = _mm_unpacklo_epi64( + in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); + in0 = _mm_unpacklo_epi64( + in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); +#if DCT_HIGH_BIT_DEPTH + // Check inputs small enough to use optimised code + cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)), + _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00))); + cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)), + _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00))); + test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1)); + if (test) { + aom_highbd_fdct4x4_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + + // multiply by 16 to give some extra precision + in0 = _mm_slli_epi16(in0, 4); + in1 = _mm_slli_epi16(in1, 4); + // if (i == 0 && input[0]) input[0] += 1; + // add 1 to the upper left pixel if it is non-zero, which helps reduce + // the round-trip error + { + // The mask will only contain whether the first value is zero, all 
+ // other comparison will fail as something shifted by 4 (above << 4) + // can never be equal to one. To increment in the non-zero case, we + // add the mask and one for the first element: + // - if zero, mask = -1, v = v - 1 + 1 = v + // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 + __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); + in0 = _mm_add_epi16(in0, mask); + in0 = _mm_add_epi16(in0, k__nonzero_bias_b); + } + // There are 4 total stages, alternating between an add/subtract stage + // followed by an multiply-and-add stage. + { + // Stage 1: Add/subtract + + // in0 = [i0 i1 i2 i3 iC iD iE iF] + // in1 = [i4 i5 i6 i7 i8 i9 iA iB] + const __m128i r0 = _mm_unpacklo_epi16(in0, in1); + const __m128i r1 = _mm_unpackhi_epi16(in0, in1); + // r0 = [i0 i4 i1 i5 i2 i6 i3 i7] + // r1 = [iC i8 iD i9 iE iA iF iB] + const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4); + const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4); + // r2 = [i0 i4 i1 i5 i3 i7 i2 i6] + // r3 = [iC i8 iD i9 iF iB iE iA] + + const __m128i t0 = _mm_add_epi16(r2, r3); + const __m128i t1 = _mm_sub_epi16(r2, r3); + // t0 = [a0 a4 a1 a5 a3 a7 a2 a6] + // t1 = [aC a8 aD a9 aF aB aE aA] + + // Stage 2: multiply by constants (which gets us into 32 bits). 
+ // The constants needed here are: + // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16] + // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16] + // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08] + // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24] + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D); + // Then add and right-shift to get back to 16-bit range + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // w0 = [b0 b1 b7 b6] + // w1 = [b8 b9 bF bE] + // w2 = [b4 b5 b3 b2] + // w3 = [bC bD bB bA] + const __m128i x0 = _mm_packs_epi32(w0, w1); + const __m128i x1 = _mm_packs_epi32(w2, w3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&x0, &x1); + if (overflow) { + aom_highbd_fdct4x4_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // x0 = [b0 b1 b7 b6 b8 b9 bF bE] + // x1 = [b4 b5 b3 b2 bC bD bB bA] + in0 = _mm_shuffle_epi32(x0, 0xD8); + in1 = _mm_shuffle_epi32(x1, 0x8D); + // in0 = [b0 b1 b8 b9 b7 b6 bF bE] + // in1 = [b3 b2 bB bA b4 b5 bC bD] + } + { + // vertical DCTs finished. Now we do the horizontal DCTs. 
+ // Stage 3: Add/subtract + + // t0 = [c0 c1 c8 c9 c4 c5 cC cD] + // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE] + const __m128i t0 = ADD_EPI16(in0, in1); + const __m128i t1 = SUB_EPI16(in0, in1); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&t0, &t1); + if (overflow) { + aom_highbd_fdct4x4_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + + // Stage 4: multiply by constants (which gets us into 32 bits). + { + // The constants needed here are: + // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16] + // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16] + // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24] + // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08] + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E); + const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F); + const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H); + // Then add and right-shift to get back to 16-bit range + // but this combines the final right-shift as well to save operations + // This unusual rounding operations is to maintain bit-accurate + // compatibility with the c version of this function which has two + // rounding steps in a row. 
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2); + // w0 = [o0 o4 o8 oC] + // w1 = [o2 o6 oA oE] + // w2 = [o1 o5 o9 oD] + // w3 = [o3 o7 oB oF] + // remember the o's are numbered according to the correct output location + const __m128i x0 = _mm_packs_epi32(w0, w1); + const __m128i x1 = _mm_packs_epi32(w2, w3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&x0, &x1); + if (overflow) { + aom_highbd_fdct4x4_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + { + // x0 = [o0 o4 o8 oC o2 o6 oA oE] + // x1 = [o1 o5 o9 oD o3 o7 oB oF] + const __m128i y0 = _mm_unpacklo_epi16(x0, x1); + const __m128i y1 = _mm_unpackhi_epi16(x0, x1); + // y0 = [o0 o1 o4 o5 o8 o9 oC oD] + // y1 = [o2 o3 o6 o7 oA oB oE oF] + in0 = _mm_unpacklo_epi32(y0, y1); + // in0 = [o0 o1 o2 o3 o4 o5 o6 o7] + in1 = _mm_unpackhi_epi32(y0, y1); + // in1 = [o8 o9 oA oB oC oD oE oF] + } + } + } + // Post-condition (v + 1) >> 2 is now incorporated into previous + // add and right-shift commands. Only 2 store instructions needed + // because we are using the fact that 1/3 are stored just after 0/2. + storeu_output(&in0, output + 0 * 4); + storeu_output(&in1, output + 2 * 4); +} + +void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { + int pass; + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. 
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); +#if DCT_HIGH_BIT_DEPTH + int overflow; +#endif + // Load input + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + // Pre-condition input (shift by two) + in0 = _mm_slli_epi16(in0, 2); + in1 = _mm_slli_epi16(in1, 2); + in2 = _mm_slli_epi16(in2, 2); + in3 = _mm_slli_epi16(in3, 2); + in4 = _mm_slli_epi16(in4, 2); + in5 = _mm_slli_epi16(in5, 2); + in6 = _mm_slli_epi16(in6, 2); + in7 = _mm_slli_epi16(in7, 2); + + // We do two passes, first the columns, then the rows. The results of the + // first pass are transposed so that the same column code can be reused. The + // results of the second pass are also transposed so that the rows (processed + // as columns) are put back in row positions. + for (pass = 0; pass < 2; pass++) { + // To store results of each pass before the transpose. 
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7; + // Add/subtract + const __m128i q0 = ADD_EPI16(in0, in7); + const __m128i q1 = ADD_EPI16(in1, in6); + const __m128i q2 = ADD_EPI16(in2, in5); + const __m128i q3 = ADD_EPI16(in3, in4); + const __m128i q4 = SUB_EPI16(in3, in4); + const __m128i q5 = SUB_EPI16(in2, in5); + const __m128i q6 = SUB_EPI16(in1, in6); + const __m128i q7 = SUB_EPI16(in0, in7); +#if DCT_HIGH_BIT_DEPTH + if (pass == 1) { + overflow = + check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } + } +#endif // DCT_HIGH_BIT_DEPTH + // Work on first four results + { + // Add/subtract + const __m128i r0 = ADD_EPI16(q0, q3); + const __m128i r1 = ADD_EPI16(q1, q2); + const __m128i r2 = SUB_EPI16(q1, q2); + const __m128i r3 = SUB_EPI16(q0, q3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us into 32bits + { + const __m128i t0 = _mm_unpacklo_epi16(r0, r1); + const __m128i t1 = _mm_unpackhi_epi16(r0, r1); + const __m128i t2 = _mm_unpacklo_epi16(r2, r3); + const __m128i t3 = _mm_unpackhi_epi16(r2, r3); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, 
k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res0 = _mm_packs_epi32(w0, w1); + res4 = _mm_packs_epi32(w2, w3); + res2 = _mm_packs_epi32(w4, w5); + res6 = _mm_packs_epi32(w6, w7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + // Work on next four results + { + // Interleave to do the multiply by constants which gets us into 32bits + const __m128i d0 = _mm_unpacklo_epi16(q6, q5); + const __m128i d1 = _mm_unpackhi_epi16(q6, q5); + const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); + const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); + const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); + const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); + const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); + const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); + const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); + const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); + const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); + const 
__m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); + const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); + // Combine + const __m128i r0 = _mm_packs_epi32(s0, s1); + const __m128i r1 = _mm_packs_epi32(s2, s3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&r0, &r1); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + { + // Add/subtract + const __m128i x0 = ADD_EPI16(q4, r0); + const __m128i x1 = SUB_EPI16(q4, r0); + const __m128i x2 = SUB_EPI16(q7, r1); + const __m128i x3 = ADD_EPI16(q7, r1); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us into 32bits + { + const __m128i t0 = _mm_unpacklo_epi16(x0, x3); + const __m128i t1 = _mm_unpackhi_epi16(x0, x3); + const __m128i t2 = _mm_unpacklo_epi16(x1, x2); + const __m128i t3 = _mm_unpackhi_epi16(x1, x2); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = 
_mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res1 = _mm_packs_epi32(w0, w1); + res7 = _mm_packs_epi32(w2, w3); + res5 = _mm_packs_epi32(w4, w5); + res3 = _mm_packs_epi32(w6, w7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + } + // Transpose the 8x8. + { + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); + const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); + const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); + const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); + const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); + const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); + const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); + const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 44 54 45 55 46 56 47 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = 
_mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 25 35 + // 44 54 64 74 45 55 65 75 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } + // Post-condition output and store it + { + // Post-condition (division by two) + // division by two of 16-bit signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const __m128i sign_in0 = _mm_srai_epi16(in0, 15); + const __m128i sign_in1 = _mm_srai_epi16(in1, 15); + const __m128i sign_in2 = _mm_srai_epi16(in2, 15); + const __m128i sign_in3 = _mm_srai_epi16(in3, 15); + const __m128i sign_in4 = _mm_srai_epi16(in4, 15); + const __m128i sign_in5 = _mm_srai_epi16(in5, 15); + const __m128i sign_in6 = _mm_srai_epi16(in6, 15); + const __m128i sign_in7 = _mm_srai_epi16(in7, 15); + in0 = _mm_sub_epi16(in0, sign_in0); + in1 = _mm_sub_epi16(in1, sign_in1); + in2 = _mm_sub_epi16(in2, sign_in2); + in3 = _mm_sub_epi16(in3, sign_in3); + in4 = _mm_sub_epi16(in4, sign_in4); + in5 = _mm_sub_epi16(in5, sign_in5); + in6 = _mm_sub_epi16(in6, sign_in6); + in7 = _mm_sub_epi16(in7, sign_in7); + in0 = _mm_srai_epi16(in0, 1); 
+ in1 = _mm_srai_epi16(in1, 1); + in2 = _mm_srai_epi16(in2, 1); + in3 = _mm_srai_epi16(in3, 1); + in4 = _mm_srai_epi16(in4, 1); + in5 = _mm_srai_epi16(in5, 1); + in6 = _mm_srai_epi16(in6, 1); + in7 = _mm_srai_epi16(in7, 1); + // store results + store_output(&in0, (output + 0 * 8)); + store_output(&in1, (output + 1 * 8)); + store_output(&in2, (output + 2 * 8)); + store_output(&in3, (output + 3 * 8)); + store_output(&in4, (output + 4 * 8)); + store_output(&in5, (output + 5 * 8)); + store_output(&in6, (output + 6 * 8)); + store_output(&in7, (output + 7 * 8)); + } +} + +void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { + // The 2D transform is done with two passes which are actually pretty + // similar. In the first one, we transform the columns and transpose + // the results. In the second one, we transform the rows. To achieve that, + // as the first pass results are transposed, we transform the columns (that + // is the transposed rows) and transpose the results (so that it goes back + // in normal/row positions). + int pass; + // We need an intermediate buffer between passes. + DECLARE_ALIGNED(16, int16_t, intermediate[256]); + const int16_t *in = input; + int16_t *out0 = intermediate; + tran_low_t *out1 = output; + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. 
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kOne = _mm_set1_epi16(1); + // Do the two transform/transpose passes + for (pass = 0; pass < 2; ++pass) { + // We process eight columns (transposed rows in second pass) at a time. 
+ int column_start; +#if DCT_HIGH_BIT_DEPTH + int overflow; +#endif + for (column_start = 0; column_start < 16; column_start += 8) { + __m128i in00, in01, in02, in03, in04, in05, in06, in07; + __m128i in08, in09, in10, in11, in12, in13, in14, in15; + __m128i input0, input1, input2, input3, input4, input5, input6, input7; + __m128i step1_0, step1_1, step1_2, step1_3; + __m128i step1_4, step1_5, step1_6, step1_7; + __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + __m128i step3_0, step3_1, step3_2, step3_3; + __m128i step3_4, step3_5, step3_6, step3_7; + __m128i res00, res01, res02, res03, res04, res05, res06, res07; + __m128i res08, res09, res10, res11, res12, res13, res14, res15; + // Load and pre-condition input. + if (0 == pass) { + in00 = _mm_load_si128((const __m128i *)(in + 0 * stride)); + in01 = _mm_load_si128((const __m128i *)(in + 1 * stride)); + in02 = _mm_load_si128((const __m128i *)(in + 2 * stride)); + in03 = _mm_load_si128((const __m128i *)(in + 3 * stride)); + in04 = _mm_load_si128((const __m128i *)(in + 4 * stride)); + in05 = _mm_load_si128((const __m128i *)(in + 5 * stride)); + in06 = _mm_load_si128((const __m128i *)(in + 6 * stride)); + in07 = _mm_load_si128((const __m128i *)(in + 7 * stride)); + in08 = _mm_load_si128((const __m128i *)(in + 8 * stride)); + in09 = _mm_load_si128((const __m128i *)(in + 9 * stride)); + in10 = _mm_load_si128((const __m128i *)(in + 10 * stride)); + in11 = _mm_load_si128((const __m128i *)(in + 11 * stride)); + in12 = _mm_load_si128((const __m128i *)(in + 12 * stride)); + in13 = _mm_load_si128((const __m128i *)(in + 13 * stride)); + in14 = _mm_load_si128((const __m128i *)(in + 14 * stride)); + in15 = _mm_load_si128((const __m128i *)(in + 15 * stride)); + // x = x << 2 + in00 = _mm_slli_epi16(in00, 2); + in01 = _mm_slli_epi16(in01, 2); + in02 = _mm_slli_epi16(in02, 2); + in03 = _mm_slli_epi16(in03, 2); + in04 = _mm_slli_epi16(in04, 2); + in05 = _mm_slli_epi16(in05, 2); + in06 = _mm_slli_epi16(in06, 2); + in07 
= _mm_slli_epi16(in07, 2); + in08 = _mm_slli_epi16(in08, 2); + in09 = _mm_slli_epi16(in09, 2); + in10 = _mm_slli_epi16(in10, 2); + in11 = _mm_slli_epi16(in11, 2); + in12 = _mm_slli_epi16(in12, 2); + in13 = _mm_slli_epi16(in13, 2); + in14 = _mm_slli_epi16(in14, 2); + in15 = _mm_slli_epi16(in15, 2); + } else { + in00 = _mm_load_si128((const __m128i *)(in + 0 * 16)); + in01 = _mm_load_si128((const __m128i *)(in + 1 * 16)); + in02 = _mm_load_si128((const __m128i *)(in + 2 * 16)); + in03 = _mm_load_si128((const __m128i *)(in + 3 * 16)); + in04 = _mm_load_si128((const __m128i *)(in + 4 * 16)); + in05 = _mm_load_si128((const __m128i *)(in + 5 * 16)); + in06 = _mm_load_si128((const __m128i *)(in + 6 * 16)); + in07 = _mm_load_si128((const __m128i *)(in + 7 * 16)); + in08 = _mm_load_si128((const __m128i *)(in + 8 * 16)); + in09 = _mm_load_si128((const __m128i *)(in + 9 * 16)); + in10 = _mm_load_si128((const __m128i *)(in + 10 * 16)); + in11 = _mm_load_si128((const __m128i *)(in + 11 * 16)); + in12 = _mm_load_si128((const __m128i *)(in + 12 * 16)); + in13 = _mm_load_si128((const __m128i *)(in + 13 * 16)); + in14 = _mm_load_si128((const __m128i *)(in + 14 * 16)); + in15 = _mm_load_si128((const __m128i *)(in + 15 * 16)); + // x = (x + 1) >> 2 + in00 = _mm_add_epi16(in00, kOne); + in01 = _mm_add_epi16(in01, kOne); + in02 = _mm_add_epi16(in02, kOne); + in03 = _mm_add_epi16(in03, kOne); + in04 = _mm_add_epi16(in04, kOne); + in05 = _mm_add_epi16(in05, kOne); + in06 = _mm_add_epi16(in06, kOne); + in07 = _mm_add_epi16(in07, kOne); + in08 = _mm_add_epi16(in08, kOne); + in09 = _mm_add_epi16(in09, kOne); + in10 = _mm_add_epi16(in10, kOne); + in11 = _mm_add_epi16(in11, kOne); + in12 = _mm_add_epi16(in12, kOne); + in13 = _mm_add_epi16(in13, kOne); + in14 = _mm_add_epi16(in14, kOne); + in15 = _mm_add_epi16(in15, kOne); + in00 = _mm_srai_epi16(in00, 2); + in01 = _mm_srai_epi16(in01, 2); + in02 = _mm_srai_epi16(in02, 2); + in03 = _mm_srai_epi16(in03, 2); + in04 = _mm_srai_epi16(in04, 2); + 
in05 = _mm_srai_epi16(in05, 2); + in06 = _mm_srai_epi16(in06, 2); + in07 = _mm_srai_epi16(in07, 2); + in08 = _mm_srai_epi16(in08, 2); + in09 = _mm_srai_epi16(in09, 2); + in10 = _mm_srai_epi16(in10, 2); + in11 = _mm_srai_epi16(in11, 2); + in12 = _mm_srai_epi16(in12, 2); + in13 = _mm_srai_epi16(in13, 2); + in14 = _mm_srai_epi16(in14, 2); + in15 = _mm_srai_epi16(in15, 2); + } + in += 8; + // Calculate input for the first 8 results. + { + input0 = ADD_EPI16(in00, in15); + input1 = ADD_EPI16(in01, in14); + input2 = ADD_EPI16(in02, in13); + input3 = ADD_EPI16(in03, in12); + input4 = ADD_EPI16(in04, in11); + input5 = ADD_EPI16(in05, in10); + input6 = ADD_EPI16(in06, in09); + input7 = ADD_EPI16(in07, in08); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3, + &input4, &input5, &input6, &input7); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // Calculate input for the next 8 results. 
+ { + step1_0 = SUB_EPI16(in07, in08); + step1_1 = SUB_EPI16(in06, in09); + step1_2 = SUB_EPI16(in05, in10); + step1_3 = SUB_EPI16(in04, in11); + step1_4 = SUB_EPI16(in03, in12); + step1_5 = SUB_EPI16(in02, in13); + step1_6 = SUB_EPI16(in01, in14); + step1_7 = SUB_EPI16(in00, in15); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3, + &step1_4, &step1_5, &step1_6, &step1_7); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // Work on the first eight values; fdct8(input, even_results); + { + // Add/subtract + const __m128i q0 = ADD_EPI16(input0, input7); + const __m128i q1 = ADD_EPI16(input1, input6); + const __m128i q2 = ADD_EPI16(input2, input5); + const __m128i q3 = ADD_EPI16(input3, input4); + const __m128i q4 = SUB_EPI16(input3, input4); + const __m128i q5 = SUB_EPI16(input2, input5); + const __m128i q6 = SUB_EPI16(input1, input6); + const __m128i q7 = SUB_EPI16(input0, input7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Work on first four results + { + // Add/subtract + const __m128i r0 = ADD_EPI16(q0, q3); + const __m128i r1 = ADD_EPI16(q1, q2); + const __m128i r2 = SUB_EPI16(q1, q2); + const __m128i r3 = SUB_EPI16(q0, q3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us + // into 32 bits. 
+ { + const __m128i t0 = _mm_unpacklo_epi16(r0, r1); + const __m128i t1 = _mm_unpackhi_epi16(r0, r1); + const __m128i t2 = _mm_unpacklo_epi16(r2, r3); + const __m128i t3 = _mm_unpackhi_epi16(r2, r3); + res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + // Work on next four results + { + // Interleave to do the multiply by constants which gets us + // into 32 bits. + const __m128i d0 = _mm_unpacklo_epi16(q6, q5); + const __m128i d1 = _mm_unpackhi_epi16(q6, q5); + const __m128i r0 = + mult_round_shift(&d0, &d1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + const __m128i r1 = + mult_round_shift(&d0, &d1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&r0, &r1); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + { + // Add/subtract + const __m128i x0 = ADD_EPI16(q4, r0); + const __m128i x1 = SUB_EPI16(q4, r0); + const __m128i x2 = SUB_EPI16(q7, r1); + const __m128i x3 = ADD_EPI16(q7, r1); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us + // into 32 bits. 
+ { + const __m128i t0 = _mm_unpacklo_epi16(x0, x3); + const __m128i t1 = _mm_unpackhi_epi16(x0, x3); + const __m128i t2 = _mm_unpacklo_epi16(x1, x2); + const __m128i t3 = _mm_unpackhi_epi16(x1, x2); + res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&res02, &res14, &res10, &res06); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + } + } + // Work on the next eight values; step1 -> odd_results + { + // step 2 + { + const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); + const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); + const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); + const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); + step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // step 3 + { + step3_0 = ADD_EPI16(step1_0, step2_3); + step3_1 = ADD_EPI16(step1_1, step2_2); + step3_2 = SUB_EPI16(step1_1, step2_2); + step3_3 = SUB_EPI16(step1_0, step2_3); + step3_4 = SUB_EPI16(step1_7, step2_4); + step3_5 = 
SUB_EPI16(step1_6, step2_5); + step3_6 = ADD_EPI16(step1_6, step2_5); + step3_7 = ADD_EPI16(step1_7, step2_4); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3, + &step3_4, &step3_5, &step3_6, &step3_7); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // step 4 + { + const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); + const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); + const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); + const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); + step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // step 5 + { + step1_0 = ADD_EPI16(step3_0, step2_1); + step1_1 = SUB_EPI16(step3_0, step2_1); + step1_2 = ADD_EPI16(step3_3, step2_2); + step1_3 = SUB_EPI16(step3_3, step2_2); + step1_4 = SUB_EPI16(step3_4, step2_5); + step1_5 = ADD_EPI16(step3_4, step2_5); + step1_6 = SUB_EPI16(step3_7, step2_6); + step1_7 = ADD_EPI16(step3_7, step2_6); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3, + &step1_4, &step1_5, &step1_6, &step1_7); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // step 6 + { + const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); + const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); + const 
__m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); + const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); + res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); + const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); + const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); + const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); + res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + // Transpose the results, do it as two 8x8 transposes. + transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05, + &res06, &res07, pass, out0, out1); + transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13, + &res14, &res15, pass, out0 + 8, out1 + 8); + if (pass == 0) { + out0 += 8 * 16; + } else { + out1 += 8 * 16; + } + } + // Setup in/out for next pass. 
+ in = intermediate; + } +} + +#undef ADD_EPI16 +#undef SUB_EPI16 diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c new file mode 100644 index 000000000..a337e618d --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE2 + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/fwd_txfm_sse2.h" + +void aom_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) { + __m128i in0, in1; + __m128i tmp; + const __m128i zero = _mm_setzero_si128(); + in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in1 = _mm_unpacklo_epi64( + in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); + in0 = _mm_unpacklo_epi64( + in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); + + tmp = _mm_add_epi16(in0, in1); + in0 = _mm_unpacklo_epi16(zero, tmp); + in1 = _mm_unpackhi_epi16(zero, tmp); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + tmp = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(tmp, zero); + in1 = _mm_unpackhi_epi32(tmp, zero); + + tmp = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(tmp, 8); + + in1 = _mm_add_epi32(tmp, in0); + in0 = _mm_slli_epi32(in1, 1); + output[0] = (tran_low_t)_mm_cvtsi128_si32(in0); +} + +void aom_fdct8x8_1_sse2(const int16_t *input, 
// DC-only 16x16 forward transform: writes (sum of the 256 input samples) >> 1
// to output[0]. No other element of output is written.
//   input:  16 rows of 16 int16 samples, consecutive rows `stride` elements
//           apart. Rows are read with aligned loads, so every row start must
//           be 16-byte aligned.
//   output: receives the single coefficient at index 0.
// Samples are accumulated in eight wrapping 16-bit lanes before being widened
// to 32 bits for the final horizontal sum.
void aom_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  const __m128i zero = _mm_setzero_si128();
  const int16_t *row = input;
  __m128i acc16 = zero;
  __m128i lo32, hi32, sum32;
  int r;

  for (r = 0; r < 16; ++r) {
    const __m128i left = _mm_load_si128((const __m128i *)(row + 0));
    const __m128i right = _mm_load_si128((const __m128i *)(row + 8));
    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(left, right));
    row += stride;
  }

  // Sign-extend the eight 16-bit partial sums to 32 bits.
  lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(zero, acc16), 16);
  hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(zero, acc16), 16);

  // Horizontal reduction of the dwords into dword 0.
  sum32 = _mm_add_epi32(lo32, hi32);
  sum32 = _mm_add_epi32(_mm_unpacklo_epi32(sum32, zero),
                        _mm_unpackhi_epi32(sum32, zero));
  sum32 = _mm_add_epi32(sum32, _mm_srli_si128(sum32, 8));

  // Final scaling: arithmetic shift right by one.
  sum32 = _mm_srai_epi32(sum32, 1);
  output[0] = (tran_low_t)_mm_cvtsi128_si32(sum32);
}
// Instantiation by repeated inclusion: the two *_impl_sse2.h headers are
// included several times below, each time under a different set of macro
// values. NOTE(review): the impl headers presumably name the functions they
// define after the FDCT*_2D macros -- confirm in the headers themselves.

// Low bit depth variants of the 4x4, 8x8 and 16x16 transforms.
#define DCT_HIGH_BIT_DEPTH 0
#define FDCT4x4_2D aom_fdct4x4_sse2
#define FDCT8x8_2D aom_fdct8x8_sse2
#define FDCT16x16_2D aom_fdct16x16_sse2
#include "aom_dsp/x86/fwd_txfm_impl_sse2.h"
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D

// Low bit depth 32x32, FDCT32x32_HIGH_PRECISION off ("rd" name).
#define FDCT32x32_2D aom_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION

// Low bit depth 32x32, FDCT32x32_HIGH_PRECISION on.
#define FDCT32x32_2D aom_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH

// The same three instantiations again with DCT_HIGH_BIT_DEPTH set to 1 and
// aom_highbd_* names; only built when CONFIG_HIGHBITDEPTH is enabled.
#if CONFIG_HIGHBITDEPTH
#define DCT_HIGH_BIT_DEPTH 1
#define FDCT4x4_2D aom_highbd_fdct4x4_sse2
#define FDCT8x8_2D aom_highbd_fdct8x8_sse2
#define FDCT16x16_2D aom_highbd_fdct16x16_sse2
#include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D

#define FDCT32x32_2D aom_highbd_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION

#define FDCT32x32_2D aom_highbd_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH
#endif // CONFIG_HIGHBITDEPTH
+ */ + +#ifndef AOM_DSP_X86_FWD_TXFM_SSE2_H_ +#define AOM_DSP_X86_FWD_TXFM_SSE2_H_ + +#include "aom_dsp/x86/txfm_common_intrin.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define pair_set_epi32(a, b) \ + _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a)) + +static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { + __m128i buf0, buf1; + buf0 = _mm_mul_epu32(a, b); + a = _mm_srli_epi64(a, 32); + b = _mm_srli_epi64(b, 32); + buf1 = _mm_mul_epu32(a, b); + return _mm_add_epi64(buf0, buf1); +} + +static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) { + __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); + return _mm_unpacklo_epi64(buf0, buf1); +} + +static INLINE int check_epi16_overflow_x2(const __m128i *preg0, + const __m128i *preg1) { + const __m128i max_overflow = _mm_set1_epi16(0x7fff); + const __m128i min_overflow = _mm_set1_epi16(0x8000); + __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), + _mm_cmpeq_epi16(*preg0, min_overflow)); + __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), + _mm_cmpeq_epi16(*preg1, min_overflow)); + cmp0 = _mm_or_si128(cmp0, cmp1); + return _mm_movemask_epi8(cmp0); +} + +static INLINE int check_epi16_overflow_x4(const __m128i *preg0, + const __m128i *preg1, + const __m128i *preg2, + const __m128i *preg3) { + const __m128i max_overflow = _mm_set1_epi16(0x7fff); + const __m128i min_overflow = _mm_set1_epi16(0x8000); + __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), + _mm_cmpeq_epi16(*preg0, min_overflow)); + __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), + _mm_cmpeq_epi16(*preg1, min_overflow)); + __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow), + _mm_cmpeq_epi16(*preg2, min_overflow)); + __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow), + _mm_cmpeq_epi16(*preg3, min_overflow)); + cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, 
cmp3)); + return _mm_movemask_epi8(cmp0); +} + +static INLINE int check_epi16_overflow_x8( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + return res0 + res1; +} + +static INLINE int check_epi16_overflow_x12( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + return res0 + res1; +} + +static INLINE int check_epi16_overflow_x16( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) { + res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); + } + return res0 + res1; +} + +static INLINE int check_epi16_overflow_x32( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i 
*preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15, const __m128i *preg16, const __m128i *preg17, + const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, + const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, + const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, + const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, + const __m128i *preg30, const __m128i *preg31) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) { + res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + if (!res1) { + res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); + if (!res0) { + res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19); + if (!res1) { + res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23); + if (!res0) { + res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27); + if (!res1) + res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31); + } + } + } + } + } + return res0 + res1; +} + +static INLINE int k_check_epi32_overflow_4(const __m128i *preg0, + const __m128i *preg1, + const __m128i *preg2, + const __m128i *preg3, + const __m128i *zero) { + __m128i minus_one = _mm_set1_epi32(-1); + // Check for overflows + __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1); + __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1); + __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1); + __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1); + __m128i reg0_top_dwords = + _mm_shuffle_epi32(reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1)); + __m128i reg1_top_dwords = + _mm_shuffle_epi32(reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1)); + __m128i reg2_top_dwords = + _mm_shuffle_epi32(reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1)); + __m128i reg3_top_dwords = + _mm_shuffle_epi32(reg3_shifted, 
_MM_SHUFFLE(0, 0, 3, 1)); + __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords); + __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords); + __m128i valid_positve_01 = _mm_cmpeq_epi32(top_dwords_01, *zero); + __m128i valid_positve_23 = _mm_cmpeq_epi32(top_dwords_23, *zero); + __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one); + __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one); + int overflow_01 = + _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_01, valid_negative_01)); + int overflow_23 = + _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_23, valid_negative_23)); + return (overflow_01 + overflow_23); +} + +static INLINE int k_check_epi32_overflow_8( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *zero) { + int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); + } + return overflow; +} + +static INLINE int k_check_epi32_overflow_16( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15, const __m128i *zero) { + int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero); + if (!overflow) { + overflow = + k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero); + } + } + } + return overflow; +} + +static INLINE 
int k_check_epi32_overflow_32( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15, const __m128i *preg16, const __m128i *preg17, + const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, + const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, + const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, + const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, + const __m128i *preg30, const __m128i *preg31, const __m128i *zero) { + int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero); + if (!overflow) { + overflow = + k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero); + if (!overflow) { + overflow = + k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero); + if (!overflow) { + overflow = + k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg24, preg25, preg26, + preg27, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg28, preg29, preg30, + preg31, zero); + } + } + } + } + } + } + } + return overflow; +} + +static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { +#if CONFIG_HIGHBITDEPTH + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_store_si128((__m128i *)(dst_ptr), out0); + 
_mm_store_si128((__m128i *)(dst_ptr + 4), out1); +#else + _mm_store_si128((__m128i *)(dst_ptr), *poutput); +#endif // CONFIG_HIGHBITDEPTH +} + +static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1, + const __m128i *pmultiplier, + const __m128i *prounding, int shift) { + const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier); + const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier); + const __m128i v0 = _mm_add_epi32(u0, *prounding); + const __m128i v1 = _mm_add_epi32(u1, *prounding); + const __m128i w0 = _mm_srai_epi32(v0, shift); + const __m128i w1 = _mm_srai_epi32(v1, shift); + return _mm_packs_epi32(w0, w1); +} + +static INLINE void transpose_and_output8x8( + const __m128i *pin00, const __m128i *pin01, const __m128i *pin02, + const __m128i *pin03, const __m128i *pin04, const __m128i *pin05, + const __m128i *pin06, const __m128i *pin07, int pass, int16_t *out0_ptr, + tran_low_t *out1_ptr) { + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01); + const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03); + const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01); + const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03); + const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05); + const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07); + const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05); + const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 54 54 55 55 56 56 57 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + 
const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 21 36 + // 44 54 64 74 45 55 61 76 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + if (pass == 0) { + _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0); + _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1); + _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2); + _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3); + _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4); + _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5); + _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6); + _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7); + } else { + storeu_output(&tr2_0, (out1_ptr + 0 * 16)); + storeu_output(&tr2_1, (out1_ptr + 1 * 16)); + storeu_output(&tr2_2, (out1_ptr + 2 * 16)); + storeu_output(&tr2_3, (out1_ptr + 3 * 16)); + storeu_output(&tr2_4, (out1_ptr + 4 * 
16)); + storeu_output(&tr2_5, (out1_ptr + 5 * 16)); + storeu_output(&tr2_6, (out1_ptr + 6 * 16)); + storeu_output(&tr2_7, (out1_ptr + 7 * 16)); + } +} + +void fdct32_8col(__m128i *in0, __m128i *in1); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_X86_FWD_TXFM_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm new file mode 100644 index 000000000..8fa1c04d0 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm @@ -0,0 +1,204 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +; This file provides SSSE3 version of the forward transformation. Part +; of the macro definitions are originally derived from the ffmpeg project. +; The current version applies to x86 64-bit only. 
+ +SECTION_RODATA + +pw_11585x2: times 8 dw 23170 +pd_8192: times 4 dd 8192 + +%macro TRANSFORM_COEFFS 2 +pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 +pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1 +%endmacro + +TRANSFORM_COEFFS 11585, 11585 +TRANSFORM_COEFFS 15137, 6270 +TRANSFORM_COEFFS 16069, 3196 +TRANSFORM_COEFFS 9102, 13623 + +SECTION .text + +%if ARCH_X86_64 +%macro SUM_SUB 3 + psubw m%3, m%1, m%2 + paddw m%1, m%2 + SWAP %2, %3 +%endmacro + +; butterfly operation +%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 + pmaddwd m%1, m%3, %5 + pmaddwd m%2, m%3, %6 + paddd m%1, %4 + paddd m%2, %4 + psrad m%1, 14 + psrad m%2, 14 +%endmacro + +%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 + punpckhwd m%6, m%2, m%1 + MUL_ADD_2X %7, %6, %6, %5, [pw_%4_%3], [pw_%3_m%4] + punpcklwd m%2, m%1 + MUL_ADD_2X %1, %2, %2, %5, [pw_%4_%3], [pw_%3_m%4] + packssdw m%1, m%7 + packssdw m%2, m%6 +%endmacro + +; matrix transpose +%macro INTERLEAVE_2X 4 + punpckh%1 m%4, m%2, m%3 + punpckl%1 m%2, m%3 + SWAP %3, %4 +%endmacro + +%macro TRANSPOSE8X8 9 + INTERLEAVE_2X wd, %1, %2, %9 + INTERLEAVE_2X wd, %3, %4, %9 + INTERLEAVE_2X wd, %5, %6, %9 + INTERLEAVE_2X wd, %7, %8, %9 + + INTERLEAVE_2X dq, %1, %3, %9 + INTERLEAVE_2X dq, %2, %4, %9 + INTERLEAVE_2X dq, %5, %7, %9 + INTERLEAVE_2X dq, %6, %8, %9 + + INTERLEAVE_2X qdq, %1, %5, %9 + INTERLEAVE_2X qdq, %3, %7, %9 + INTERLEAVE_2X qdq, %2, %6, %9 + INTERLEAVE_2X qdq, %4, %8, %9 + + SWAP %2, %5 + SWAP %4, %7 +%endmacro + +; 1D forward 8x8 DCT transform +%macro FDCT8_1D 1 + SUM_SUB 0, 7, 9 + SUM_SUB 1, 6, 9 + SUM_SUB 2, 5, 9 + SUM_SUB 3, 4, 9 + + SUM_SUB 0, 3, 9 + SUM_SUB 1, 2, 9 + SUM_SUB 6, 5, 9 +%if %1 == 0 + SUM_SUB 0, 1, 9 +%endif + + BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 + + pmulhrsw m6, m12 + pmulhrsw m5, m12 +%if %1 == 0 + pmulhrsw m0, m12 + pmulhrsw m1, m12 +%else + BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 + SWAP 0, 1 +%endif + + SUM_SUB 4, 5, 9 + SUM_SUB 7, 6, 9 + BUTTERFLY_4X 4, 7, 3196, 16069, 
; fdct8x8(input, output, stride) -- SSSE3 8x8 forward transform.
; Loads eight rows of eight 16-bit values, runs FDCT8_1D twice with a
; transpose after each pass, halves every result with DIVIDE_ROUND_2X and
; writes the 64 results through STORE_OUTPUT.
; x86inc cglobal arguments: 3 function arguments, 5 general-purpose
; registers, 13 xmm registers.
INIT_XMM ssse3
cglobal fdct8x8, 3, 5, 13, input, output, stride

  ; Constants used by FDCT8_1D: m8 is the rounding term handed to
  ; BUTTERFLY_4X, m12 is the pmulhrsw multiplier.
  mova               m8, [pd_8192]
  mova              m12, [pw_11585x2]

  ; r3 = 2 * stride and r4 = 4 * stride are byte offsets. NOTE(review): this
  ; presumably treats stride as a count of 16-bit samples, so r3 is one row
  ; and r4 is two rows -- confirm against the C prototype.
  lea                r3, [2 * strideq]
  lea                r4, [4 * strideq]
  mova               m0, [inputq]
  mova               m1, [inputq + r3]
  lea                inputq, [inputq + r4]
  mova               m2, [inputq]
  mova               m3, [inputq + r3]
  lea                inputq, [inputq + r4]
  mova               m4, [inputq]
  mova               m5, [inputq + r3]
  lea                inputq, [inputq + r4]
  mova               m6, [inputq]
  mova               m7, [inputq + r3]

  ; left shift by 2 to increase forward transformation precision
  psllw              m0, 2
  psllw              m1, 2
  psllw              m2, 2
  psllw              m3, 2
  psllw              m4, 2
  psllw              m5, 2
  psllw              m6, 2
  psllw              m7, 2

  ; column transform
  FDCT8_1D  0
  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9

  ; second 1-D pass on the transposed data, then transpose back
  FDCT8_1D  1
  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9

  ; halve every coefficient; the macro adds 1 to negative values before the
  ; arithmetic shift, so the division truncates toward zero
  DIVIDE_ROUND_2X   0, 1, 9, 10
  DIVIDE_ROUND_2X   2, 3, 9, 10
  DIVIDE_ROUND_2X   4, 5, 9, 10
  DIVIDE_ROUND_2X   6, 7, 9, 10

  ; STORE_OUTPUT index, register: the index advances by 8 coefficients per
  ; register, i.e. one output row each
  STORE_OUTPUT       0, 0
  STORE_OUTPUT       8, 1
  STORE_OUTPUT      16, 2
  STORE_OUTPUT      24, 3
  STORE_OUTPUT      32, 4
  STORE_OUTPUT      40, 5
  STORE_OUTPUT      48, 6
  STORE_OUTPUT      56, 7

  RET
;void aom_half_horiz_vert_variance16x_h_sse2(unsigned char *ref,
;                                            int ref_stride,
;                                            unsigned char *src,
;                                            int src_stride,
;                                            unsigned int height,
;                                            int *sum,
;                                            unsigned int *sumsquared)
;
; For a block 16 pixels wide and `height` rows tall, builds the half-pixel
; prediction avg(avg(ref[x], ref[x+1]) on row i, same on row i+1) with pavgb,
; subtracts src from it, and writes the sum of the differences to *sum and
; the sum of the squared differences to *sumsquared.
; Reads 17 bytes per row from `height + 1` rows of ref and 16 bytes per row
; from `height` rows of src.
; The difference sum is kept in 16-bit lanes that receive two differences per
; row (each within +/-255), so it only stays exact while height <= 64.
; The loop body runs before the counter is tested, so height must be >= 1.
; SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, arg() and the matching epilog
; macros come from aom_ports/x86_abi_support.asm, which is not shown here.
global sym(aom_half_horiz_vert_variance16x_h_sse2) PRIVATE
sym(aom_half_horiz_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                ;  error accumulator
        pxor            xmm7,           xmm7                ;  sse accumulator
        mov             rsi,            arg(0) ;ref

        mov             rdi,            arg(2) ;src
        movsxd          rcx,            dword ptr arg(4) ;height
        movsxd          rax,            dword ptr arg(1) ;ref_stride
        movsxd          rdx,            dword ptr arg(3)    ;src_stride

        pxor            xmm0,           xmm0                ;  zero, used to widen bytes to words

        movdqu          xmm5,           XMMWORD PTR [rsi]
        movdqu          xmm3,           XMMWORD PTR [rsi+1]
        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) horizontal line 1

        lea             rsi,            [rsi + rax]

aom_half_horiz_vert_variance16x_h_1:
        movdqu          xmm1,           XMMWORD PTR [rsi]     ;  row i+1, pixels 0..15
        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;  row i+1, pixels 1..16
        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2) horizontal line i+1

        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the above

        movdqa          xmm4,           xmm5
        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above (pixels 0..7)
        punpckhbw       xmm4,           xmm0                ;  xmm4 = words of above (pixels 8..15)

        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3

        movq            xmm3,           QWORD PTR [rdi+8]   ;  xmm3 = d8,d9,d10..d15
        punpcklbw       xmm3,           xmm0
        psubw           xmm4,           xmm3

        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
        paddw           xmm6,           xmm4
        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
        pmaddwd         xmm4,           xmm4
        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
        paddd           xmm7,           xmm4

        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row

        lea             rsi,            [rsi + rax]
        lea             rdi,            [rdi + rdx]

        sub             rcx,            1                   ;  one row done
        jnz             aom_half_horiz_vert_variance16x_h_1 ;  loop until all rows are processed

        ; Horizontal reduction. xmm0 is still zero here.
        pxor        xmm1,           xmm1
        pxor        xmm5,           xmm5

        ; Move the eight word sums into the upper halves of dwords and shift
        ; them back down arithmetically, i.e. sign-extend them to 32 bits.
        punpcklwd   xmm0,           xmm6
        punpckhwd   xmm1,           xmm6
        psrad       xmm0,           16
        psrad       xmm1,           16
        paddd       xmm0,           xmm1
        movdqa      xmm1,           xmm0

        ; Fold the four dword partial sums of each accumulator into the low
        ; dword (xmm6 = squared differences, xmm0 = differences).
        movdqa      xmm6,           xmm7
        punpckldq   xmm6,           xmm5
        punpckhdq   xmm7,           xmm5
        paddd       xmm6,           xmm7

        punpckldq   xmm0,           xmm5
        punpckhdq   xmm1,           xmm5
        paddd       xmm0,           xmm1

        movdqa      xmm7,           xmm6
        movdqa      xmm1,           xmm0

        psrldq      xmm7,           8
        psrldq      xmm1,           8

        paddd       xmm6,           xmm7
        paddd       xmm0,           xmm1

        mov         rsi,            arg(5) ;[Sum]
        mov         rdi,            arg(6) ;[SSE]

        movd        [rsi],       xmm0
        movd        [rdi],       xmm6

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void aom_half_horiz_variance16x_h_sse2(unsigned char *ref,
;                                       int ref_stride,
;                                       unsigned char *src,
;                                       int src_stride,
;                                       unsigned int height,
;                                       int *sum,
;                                       unsigned int *sumsquared)
;
; For a block 16 pixels wide and `height` rows tall, builds the horizontal
; half-pixel prediction avg(ref[x], ref[x+1]) with pavgb, subtracts src from
; it, and writes the sum of the differences to *sum and the sum of the squared
; differences to *sumsquared.
; Reads 17 bytes per row from ref and 16 bytes per row from src.
; The difference sum is kept in 16-bit lanes that receive two differences per
; row (each within +/-255), so it only stays exact while height <= 64.
; The loop body runs before the counter is tested, so height must be >= 1.
; SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, arg() and the matching epilog
; macros come from aom_ports/x86_abi_support.asm, which is not shown here.
global sym(aom_half_horiz_variance16x_h_sse2) PRIVATE
sym(aom_half_horiz_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                ;  error accumulator
        pxor            xmm7,           xmm7                ;  sse accumulator
        mov             rsi,            arg(0) ;ref

        mov             rdi,            arg(2) ;src
        movsxd          rcx,            dword ptr arg(4) ;height
        movsxd          rax,            dword ptr arg(1) ;ref_stride
        movsxd          rdx,            dword ptr arg(3)    ;src_stride

        pxor            xmm0,           xmm0                ;  zero, used to widen bytes to words

aom_half_horiz_variance16x_h_1:
        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16

        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
        movdqa          xmm1,           xmm5
        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above (pixels 0..7)
        punpckhbw       xmm1,           xmm0                ;  xmm1 = words of above (pixels 8..15)

        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
        movq            xmm2,           QWORD PTR [rdi+8]   ;  xmm2 = d8,d9,d10..d15
        punpcklbw       xmm2,           xmm0

        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
        psubw           xmm1,           xmm2
        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
        paddw           xmm6,           xmm1
        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
        pmaddwd         xmm1,           xmm1
        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
        paddd           xmm7,           xmm1

        lea             rsi,            [rsi + rax]
        lea             rdi,            [rdi + rdx]

        sub             rcx,            1                   ;  one row done
        jnz             aom_half_horiz_variance16x_h_1      ;  loop until all rows are processed

        ; Horizontal reduction. xmm0 is still zero here.
        pxor        xmm1,           xmm1
        pxor        xmm5,           xmm5

        ; Move the eight word sums into the upper halves of dwords and shift
        ; them back down arithmetically, i.e. sign-extend them to 32 bits.
        punpcklwd   xmm0,           xmm6
        punpckhwd   xmm1,           xmm6
        psrad       xmm0,           16
        psrad       xmm1,           16
        paddd       xmm0,           xmm1
        movdqa      xmm1,           xmm0

        ; Fold the four dword partial sums of each accumulator into the low
        ; dword (xmm6 = squared differences, xmm0 = differences).
        movdqa      xmm6,           xmm7
        punpckldq   xmm6,           xmm5
        punpckhdq   xmm7,           xmm5
        paddd       xmm6,           xmm7

        punpckldq   xmm0,           xmm5
        punpckhdq   xmm1,           xmm5
        paddd       xmm0,           xmm1

        movdqa      xmm7,           xmm6
        movdqa      xmm1,           xmm0

        psrldq      xmm7,           8
        psrldq      xmm1,           8

        paddd       xmm6,           xmm7
        paddd       xmm0,           xmm1

        mov         rsi,            arg(5) ;[Sum]
        mov         rdi,            arg(6) ;[SSE]

        movd        [rsi],       xmm0
        movd        [rdi],       xmm6

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
// Assembly kernels (halfpix_variance_impl_sse2.asm). Each one compares a
// 16-pixel-wide, `height`-row half-pixel prediction built from `ref` with
// `src`, and returns the sum of differences in *sum and the sum of squared
// differences in *sumsquared.
void aom_half_horiz_vert_variance16x_h_sse2(
    const unsigned char *ref, int ref_stride, const unsigned char *src,
    int src_stride, unsigned int height, int *sum, unsigned int *sumsquared);
void aom_half_horiz_variance16x_h_sse2(
    const unsigned char *ref, int ref_stride, const unsigned char *src,
    int src_stride, unsigned int height, int *sum, unsigned int *sumsquared);
void aom_half_vert_variance16x_h_sse2(
    const unsigned char *ref, int ref_stride, const unsigned char *src,
    int src_stride, unsigned int height, int *sum, unsigned int *sumsquared);

// 16x16 variance against the horizontally half-pixel-interpolated `src`.
// Stores the sum of squared differences in *sse and returns
// sse - sum * sum / 256 (256 = number of pixels in the block).
uint32_t aom_variance_halfpixvar16x16_h_sse2(const unsigned char *src,
                                             int src_stride,
                                             const unsigned char *dst,
                                             int dst_stride, uint32_t *sse) {
  int xsum0;
  unsigned int sum_sq;
  uint32_t mean_term;

  aom_half_horiz_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
                                    &xsum0, &sum_sq);
  *sse = sum_sq;

  // Each of the 256 differences lies in [-255, 255].
  assert(xsum0 <= 255 * 16 * 16);
  assert(xsum0 >= -255 * 16 * 16);

  mean_term = (uint32_t)((int64_t)xsum0 * xsum0) >> 8;
  return sum_sq - mean_term;
}

// 16x16 variance against the vertically half-pixel-interpolated `src`.
// Stores the sum of squared differences in *sse and returns
// sse - sum * sum / 256.
uint32_t aom_variance_halfpixvar16x16_v_sse2(const unsigned char *src,
                                             int src_stride,
                                             const unsigned char *dst,
                                             int dst_stride, uint32_t *sse) {
  int xsum0;
  unsigned int sum_sq;
  uint32_t mean_term;

  aom_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
                                   &xsum0, &sum_sq);
  *sse = sum_sq;

  // Each of the 256 differences lies in [-255, 255].
  assert(xsum0 <= 255 * 16 * 16);
  assert(xsum0 >= -255 * 16 * 16);

  mean_term = (uint32_t)((int64_t)xsum0 * xsum0) >> 8;
  return sum_sq - mean_term;
}

// 16x16 variance against `src` half-pixel-interpolated in both directions.
// Stores the sum of squared differences in *sse and returns
// sse - sum * sum / 256.
uint32_t aom_variance_halfpixvar16x16_hv_sse2(const unsigned char *src,
                                              int src_stride,
                                              const unsigned char *dst,
                                              int dst_stride, uint32_t *sse) {
  int xsum0;
  unsigned int sum_sq;
  uint32_t mean_term;

  aom_half_horiz_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
                                         &xsum0, &sum_sq);
  *sse = sum_sq;

  // Each of the 256 differences lies in [-255, 255].
  assert(xsum0 <= 255 * 16 * 16);
  assert(xsum0 >= -255 * 16 * 16);

  mean_term = (uint32_t)((int64_t)xsum0 * xsum0) >> 8;
  return sum_sq - mean_term;
}
// Copies a width x h block of 16-bit pixels from src8 to dst8 (both are
// converted with CONVERT_TO_SHORTPTR; strides are in pixels). The filter
// arguments and bd are unused; they are present so the function matches the
// signature of the other convolve entry points.
//
// width selects the path: > 32 copies 64 pixels per row, > 16 copies 32,
// > 8 copies 16, > 4 copies 8, otherwise 4.
//
// The 16/8/4-wide paths handle two rows per iteration. A trailing odd row is
// copied on its own, so an odd h no longer touches one row beyond the block,
// and h <= 0 copies nothing.
void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride,
                                   uint8_t *dst8, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int filter_x_stride,
                                   const int16_t *filter_y, int filter_y_stride,
                                   int width, int h, int bd) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  assert(width % 4 == 0);
  if (width > 32) {  // width = 64
    for (; h > 0; --h) {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      _mm256_storeu_si256((__m256i *)(dst + 32), p2);
      _mm256_storeu_si256((__m256i *)(dst + 48), p3);
      dst += dst_stride;
    }
  } else if (width > 16) {  // width = 32
    for (; h > 0; --h) {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      dst += dst_stride;
    }
  } else if (width > 8) {  // width = 16
    for (; h > 1; h -= 2) {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 =
          _mm256_loadu_si256((const __m256i *)(src + src_stride));
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + dst_stride), p1);
      src += 2 * src_stride;
      dst += 2 * dst_stride;
    }
    if (h > 0) {  // odd trailing row
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      _mm256_storeu_si256((__m256i *)dst, p0);
    }
  } else if (width > 4) {  // width = 8
    for (; h > 1; h -= 2) {
      const __m128i p0 = _mm_loadu_si128((const __m128i *)src);
      const __m128i p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
      _mm_storeu_si128((__m128i *)dst, p0);
      _mm_storeu_si128((__m128i *)(dst + dst_stride), p1);
      src += 2 * src_stride;
      dst += 2 * dst_stride;
    }
    if (h > 0) {  // odd trailing row
      const __m128i p0 = _mm_loadu_si128((const __m128i *)src);
      _mm_storeu_si128((__m128i *)dst, p0);
    }
  } else {  // width = 4
    for (; h > 1; h -= 2) {
      const __m128i p0 = _mm_loadl_epi64((const __m128i *)src);
      const __m128i p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
      _mm_storel_epi64((__m128i *)dst, p0);
      _mm_storel_epi64((__m128i *)(dst + dst_stride), p1);
      src += 2 * src_stride;
      dst += 2 * dst_stride;
    }
    if (h > 0) {  // odd trailing row
      const __m128i p0 = _mm_loadl_epi64((const __m128i *)src);
      _mm_storel_epi64((__m128i *)dst, p0);
    }
  }
}
_mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); + p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); + src += src_stride; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); + u2 = _mm256_loadu_si256((const __m256i *)(dst + 32)); + u3 = _mm256_loadu_si256((const __m256i *)(dst + 48)); + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); + _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2)); + _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3)); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 16) { // width = 32 + __m256i p0, p1, u0, u1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + src += src_stride; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 8) { // width = 16 + __m256i p0, p1, u0, u1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride)); + src += src_stride << 1; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride)); + + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + dst_stride), + _mm256_avg_epu16(p1, u1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } else if (width > 4) { // width = 8 + __m128i p0, p1, u0, u1; + do { + p0 = _mm_loadu_si128((const __m128i *)src); + p1 = _mm_loadu_si128((const __m128i *)(src + src_stride)); + src += src_stride << 1; 
+ u0 = _mm_loadu_si128((const __m128i *)dst); + u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride)); + + _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0)); + _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } else { // width = 4 + __m128i p0, p1, u0, u1; + do { + p0 = _mm_loadl_epi64((const __m128i *)src); + p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride)); + src += src_stride << 1; + u0 = _mm_loadl_epi64((const __m128i *)dst); + u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride)); + + _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0)); + _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } +} + +// ----------------------------------------------------------------------------- +// Horizontal Filtering + +static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0); + const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1); + const __m256i c = _mm256_permutevar8x32_epi32(*s, idx); + + p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6 + p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7 + p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4 + p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5 +} + +// Note: +// Shared by 8x2 and 16x1 block +static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1, + __m256i *x /*x[8]*/) { + __m256i pp[8]; + pack_pixels(s0, pp); + pack_pixels(s1, &pp[4]); + x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20); + x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20); + x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20); + x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20); + x[4] = x[2]; + x[5] = x[3]; + x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31); + x[7] = 
_mm256_permute2x128_si256(pp[1], pp[5], 0x31); +} + +static INLINE void pack_pixels_with_format(const uint16_t *src, + PixelPackFormat fmt, + ptrdiff_t stride, __m256i *x) { + switch (fmt) { + case PACK_8x1: { + __m256i pp[8]; + __m256i s0; + s0 = _mm256_loadu_si256((const __m256i *)src); + pack_pixels(&s0, pp); + x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30); + x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30); + x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30); + x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30); + break; + } + case PACK_8x2: { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + stride)); + pack_16_pixels(&s0, &s1, x); + break; + } + case PACK_16x1: { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_pixels(&s0, &s1, x); + break; + } + default: { assert(0); } + } +} + +static INLINE void pack_8x1_pixels(const uint16_t *src, const ptrdiff_t pitch, + __m256i *x /*x[4]*/) { + pack_pixels_with_format(src, PACK_8x1, pitch, x); +} + +static INLINE void pack_8x2_pixels(const uint16_t *src, const ptrdiff_t pitch, + __m256i *x /*x[8]*/) { + pack_pixels_with_format(src, PACK_8x2, pitch, x); +} + +static INLINE void pack_16x1_pixels(const uint16_t *src, const ptrdiff_t pitch, + __m256i *x /*x[8]*/) { + pack_pixels_with_format(src, PACK_16x1, pitch, x); +} + +// Note: +// Shared by horizontal and vertical filtering +static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p0 = _mm256_set1_epi32(0x03020100); + const __m256i p1 = _mm256_set1_epi32(0x07060504); + const __m256i p2 = _mm256_set1_epi32(0x0b0a0908); + const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c); + f[0] = _mm256_shuffle_epi8(hh, p0); + f[1] = 
_mm256_shuffle_epi8(hh, p1); + f[2] = _mm256_shuffle_epi8(hh, p2); + f[3] = _mm256_shuffle_epi8(hh, p3); +} + +static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, + const __m256i *fil /*fil[4]*/, + __m256i *y) { + __m256i a, a0, a1; + + a0 = _mm256_madd_epi16(fil[0], sig[0]); + a1 = _mm256_madd_epi16(fil[3], sig[3]); + a = _mm256_add_epi32(a0, a1); + + a0 = _mm256_madd_epi16(fil[1], sig[1]); + a1 = _mm256_madd_epi16(fil[2], sig[2]); + + const __m256i min = _mm256_min_epi32(a0, a1); + a = _mm256_add_epi32(a, min); + + const __m256i max = _mm256_max_epi32(a0, a1); + a = _mm256_add_epi32(a, max); + + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + a = _mm256_add_epi32(a, rounding); + *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS); +} + +static void write_8x1_pixels(const __m256i *y, const __m256i *z, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + const __m128i a0 = _mm256_castsi256_si128(*y); + const __m128i a1 = _mm256_extractf128_si256(*y, 1); + __m128i res = _mm_packus_epi32(a0, a1); + (void)z; + (void)pitch; + res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); + _mm_storeu_si128((__m128i *)dst, res); +} + +static void write_8x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); +} + +static void write_16x1_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t dst_pitch) { + (void)dst_pitch; + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm256_storeu_si256((__m256i *)dst, a); +} + +static void filter_block_width8_horiz( + const uint16_t *src_ptr, ptrdiff_t src_pitch, const WritePixels write_8x1, + const WritePixels write_8x2, uint16_t *dst_ptr, ptrdiff_t 
dst_pitch, + uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_8x2_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + write_8x2(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + write_8x1(&res0, &res1, &max, dst_ptr, dst_pitch); + } +} + +static void aom_highbd_filter_block1d8_h8_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width8_horiz(src, src_pitch, write_8x1_pixels, write_8x2_pixels, + dst, dst_pitch, height, filter, bd); +} + +static void filter_block_width16_horiz(const uint16_t *src_ptr, + ptrdiff_t src_pitch, + const WritePixels write_16x1, + uint16_t *dst_ptr, ptrdiff_t dst_pitch, + uint32_t height, const int16_t *filter, + int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_16x1_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + write_16x1(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +static void aom_highbd_filter_block1d16_h8_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width16_horiz(src, src_pitch, write_16x1_pixels, dst, dst_pitch, + height, filter, bd); +} + +// 2-tap horizontal filtering + +static INLINE void pack_2t_filter(const 
int16_t *filter, __m256i *f) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p = _mm256_set1_epi32(0x09080706); + f[0] = _mm256_shuffle_epi8(hh, p); +} + +// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels() +// the difference is s0/s1 specifies first and second rows or, +// first 16 samples and 8-sample shifted 16 samples +static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1, + __m256i *sig) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i x0 = _mm256_shuffle_epi8(*s0, sf2); + __m256i x1 = _mm256_shuffle_epi8(*s1, sf2); + __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx); + __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + r1 = _mm256_shuffle_epi8(r1, sf2); + sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20); + sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20); +} + +static INLINE void pack_8x2_2t_pixels(const uint16_t *src, + const ptrdiff_t pitch, __m256i *sig) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_16x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_8x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + __m256i x0 = _mm256_shuffle_epi8(r0, sf2); + r0 = _mm256_permutevar8x32_epi32(r0, 
idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20); +} + +// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels() +static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + __m256i x1 = _mm256_madd_epi16(sig[1], *f); + x0 = _mm256_add_epi32(x0, rounding); + x1 = _mm256_add_epi32(x1, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static INLINE void filter_8x2_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_16_2t_pixels(sig, f, y0, y1); +} + +static INLINE void filter_16x1_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_16_2t_pixels(sig, f, y0, y1); +} + +static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + x0 = _mm256_add_epi32(x0, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); +} + +static void filter_block_width8_2t_horiz( + const uint16_t *src_ptr, ptrdiff_t src_pitch, const WritePixels write_8x1, + const WritePixels write_8x2, uint16_t *dst_ptr, ptrdiff_t dst_pitch, + uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_8x2_2t_pixels(src_ptr, src_pitch, signal); + filter_8x2_2t_pixels(signal, &ff, &res0, &res1); + write_8x2(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_2t_pixels(src_ptr, signal); + 
filter_8x1_2t_pixels(signal, &ff, &res0); + write_8x1(&res0, &res1, &max, dst_ptr, dst_pitch); + } +} + +static void aom_highbd_filter_block1d8_h2_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width8_2t_horiz(src, src_pitch, write_8x1_pixels, + write_8x2_pixels, dst, dst_pitch, height, filter, + bd); +} + +static void filter_block_width16_2t_horiz(const uint16_t *src_ptr, + ptrdiff_t src_pitch, + const WritePixels write_16x1, + uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_16x1_2t_pixels(src_ptr, signal); + filter_16x1_2t_pixels(signal, &ff, &res0, &res1); + write_16x1(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +static void aom_highbd_filter_block1d16_h2_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width16_2t_horiz(src, src_pitch, write_16x1_pixels, dst, + dst_pitch, height, filter, bd); +} + +// Vertical Filtering + +static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src)); + __m256i s1 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch))); + __m256i s2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 2 * pitch))); + __m256i s3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 3 * pitch))); + __m256i s4 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 4 * pitch))); + __m256i s5 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 5 * pitch))); + __m256i s6 = 
_mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 6 * pitch))); + + s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1); + s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1); + s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1); + s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1); + s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1); + + sig[0] = _mm256_unpacklo_epi16(s0, s1); + sig[4] = _mm256_unpackhi_epi16(s0, s1); + sig[1] = _mm256_unpacklo_epi16(s2, s3); + sig[5] = _mm256_unpackhi_epi16(s2, s3); + sig[2] = _mm256_unpacklo_epi16(s4, s5); + sig[6] = _mm256_unpackhi_epi16(s4, s5); + sig[8] = s6; +} + +static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + __m256i s0 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 7 * pitch))); + // base + 8th row + __m256i s1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 8 * pitch))); + __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1); + __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + sig[3] = _mm256_unpacklo_epi16(s2, s3); + sig[7] = _mm256_unpackhi_epi16(s2, s3); + sig[8] = s1; +} + +static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_8x1_pixels(sig, f, y0); + filter_8x1_pixels(&sig[4], f, y1); +} + +static INLINE void update_pixels(__m256i *sig) { + int i; + for (i = 0; i < 3; ++i) { + sig[i] = sig[i + 1]; + sig[i + 4] = sig[i + 5]; + } +} + +static INLINE void write_8x1_pixels_ver(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + (void)pitch; + const __m128i v0 = _mm256_castsi256_si128(*y0); + const __m128i v1 = _mm256_castsi256_si128(*y1); + __m128i p = _mm_packus_epi32(v0, v1); + p = _mm_min_epi16(p, 
_mm256_castsi256_si128(*mask)); + _mm_storeu_si128((__m128i *)dst, p); +} + +static void filter_block_width8_vert(const uint16_t *src_ptr, + ptrdiff_t src_pitch, WritePixels write_8x1, + WritePixels write_8x2, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m256i signal[9], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_8x9_init(src_ptr, src_pitch, signal); + + do { + pack_8x9_pixels(src_ptr, src_pitch, signal); + + filter_8x9_pixels(signal, ff, &res0, &res1); + write_8x2(&res0, &res1, &max, dst_ptr, dst_pitch); + update_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 1); + + if (height > 0) { + pack_8x9_pixels(src_ptr, src_pitch, signal); + filter_8x9_pixels(signal, ff, &res0, &res1); + write_8x1(&res0, &res1, &max, dst_ptr, dst_pitch); + } +} + +static void aom_highbd_filter_block1d8_v8_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width8_vert(src, src_pitch, write_8x1_pixels_ver, + write_8x2_pixels, dst, dst_pitch, height, filter, + bd); +} + +static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i u0, u1, u2, u3; + // load 0-6 rows + const __m256i s0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch)); + const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch)); + const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch)); + const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch)); + const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch)); + + u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low + u1 = 
_mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high + + u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low + u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high + + sig[0] = _mm256_unpacklo_epi16(u0, u2); + sig[4] = _mm256_unpackhi_epi16(u0, u2); + + sig[8] = _mm256_unpacklo_epi16(u1, u3); + sig[12] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s2, s3, 0x20); + u1 = _mm256_permute2x128_si256(s2, s3, 0x31); + + u2 = _mm256_permute2x128_si256(s3, s4, 0x20); + u3 = _mm256_permute2x128_si256(s3, s4, 0x31); + + sig[1] = _mm256_unpacklo_epi16(u0, u2); + sig[5] = _mm256_unpackhi_epi16(u0, u2); + + sig[9] = _mm256_unpacklo_epi16(u1, u3); + sig[13] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s4, s5, 0x20); + u1 = _mm256_permute2x128_si256(s4, s5, 0x31); + + u2 = _mm256_permute2x128_si256(s5, s6, 0x20); + u3 = _mm256_permute2x128_si256(s5, s6, 0x31); + + sig[2] = _mm256_unpacklo_epi16(u0, u2); + sig[6] = _mm256_unpackhi_epi16(u0, u2); + + sig[10] = _mm256_unpacklo_epi16(u1, u3); + sig[14] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s6; +} + +static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch)); + // base + 8th row + const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch)); + + __m256i u0, u1, u2, u3; + u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20); + u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31); + + u2 = _mm256_permute2x128_si256(s7, s8, 0x20); + u3 = _mm256_permute2x128_si256(s7, s8, 0x31); + + sig[3] = _mm256_unpacklo_epi16(u0, u2); + sig[7] = _mm256_unpackhi_epi16(u0, u2); + + sig[11] = _mm256_unpacklo_epi16(u1, u3); + sig[15] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s8; +} + +static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + __m256i res[4]; + int i; + for (i = 0; i < 4; ++i) { + 
filter_8x1_pixels(&sig[i << 2], f, &res[i]); + } + + const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]); + const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]); + *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20); + *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31); +} + +static INLINE void write_16x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i p = _mm256_min_epi16(*y0, *mask); + _mm256_storeu_si256((__m256i *)dst, p); + p = _mm256_min_epi16(*y1, *mask); + _mm256_storeu_si256((__m256i *)(dst + pitch), p); +} + +static INLINE void write_16x1_pixels_ver(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + (void)y1; + (void)pitch; + const __m256i p = _mm256_min_epi16(*y0, *mask); + _mm256_storeu_si256((__m256i *)dst, p); +} + +static void update_16x9_pixels(__m256i *sig) { + update_pixels(&sig[0]); + update_pixels(&sig[8]); +} + +static void filter_block_width16_vert(const uint16_t *src_ptr, + ptrdiff_t src_pitch, + WritePixels write_16x1, + WritePixels write_16x2, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m256i signal[17], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_16x9_init(src_ptr, src_pitch, signal); + + do { + pack_16x9_pixels(src_ptr, src_pitch, signal); + filter_16x9_pixels(signal, ff, &res0, &res1); + write_16x2(&res0, &res1, &max, dst_ptr, dst_pitch); + update_16x9_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 1); + + if (height > 0) { + pack_16x9_pixels(src_ptr, src_pitch, signal); + filter_16x9_pixels(signal, ff, &res0, &res1); + write_16x1(&res0, &res1, &max, dst_ptr, dst_pitch); + } +} + +static void aom_highbd_filter_block1d16_v8_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, 
const int16_t *filter, int bd) { + filter_block_width16_vert(src, src_pitch, write_16x1_pixels_ver, + write_16x2_pixels, dst, dst_pitch, height, filter, + bd); +} + +// 2-tap vertical filtering + +static void pack_16x2_init(const uint16_t *src, __m256i *sig) { + sig[2] = _mm256_loadu_si256((const __m256i *)src); +} + +static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // load the next row + const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch)); + sig[0] = _mm256_unpacklo_epi16(sig[2], u); + sig[1] = _mm256_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_16_2t_pixels(sig, f, y0, y1); +} + +static void filter_block_width16_2t_vert(const uint16_t *src_ptr, + ptrdiff_t src_pitch, + WritePixels write_16x1, + uint16_t *dst_ptr, ptrdiff_t dst_pitch, + uint32_t height, const int16_t *filter, + int bd) { + __m256i signal[3], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + __m256i ff; + + pack_2t_filter(filter, &ff); + pack_16x2_init(src_ptr, signal); + + do { + pack_16x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16x2_2t_pixels(signal, &ff, &res0, &res1); + write_16x1(&res0, &res1, &max, dst_ptr, dst_pitch); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +static void aom_highbd_filter_block1d16_v2_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width16_2t_vert(src, src_pitch, write_16x1_pixels, dst, + dst_pitch, height, filter, bd); +} + +static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m128i p = _mm_set1_epi32(0x09080706); + f[0] = _mm_shuffle_epi8(h, p); +} + +static void pack_8x2_init(const uint16_t *src, __m128i *sig) { + 
sig[2] = _mm_loadu_si128((const __m128i *)src); +} + +static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch, + __m128i *sig) { + // load the next row + const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch)); + sig[0] = _mm_unpacklo_epi16(sig[2], u); + sig[1] = _mm_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, + __m128i *y0, __m128i *y1) { + const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m128i x0 = _mm_madd_epi16(sig[0], *f); + __m128i x1 = _mm_madd_epi16(sig[1], *f); + x0 = _mm_add_epi32(x0, rounding); + x1 = _mm_add_epi32(x1, rounding); + *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static void write_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1, + const __m128i *mask, uint16_t *dst) { + __m128i res = _mm_packus_epi32(*y0, *y1); + res = _mm_min_epi16(res, *mask); + _mm_storeu_si128((__m128i *)dst, res); +} + +typedef void (*Write8Pixels)(const __m128i *y0, const __m128i *y1, + const __m128i *mask, uint16_t *dst); + +static void filter_block_width8_2t_vert(const uint16_t *src_ptr, + ptrdiff_t src_pitch, + Write8Pixels write_8x1, + uint16_t *dst_ptr, ptrdiff_t dst_pitch, + uint32_t height, const int16_t *filter, + int bd) { + __m128i signal[3], res0, res1; + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + __m128i ff; + + pack_8x1_2t_filter(filter, &ff); + pack_8x2_init(src_ptr, signal); + + do { + pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); + filter_8_2t_pixels(signal, &ff, &res0, &res1); + write_8x1(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +static void aom_highbd_filter_block1d8_v2_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width8_2t_vert(src, src_pitch, 
write_8x1_2t_pixels_ver, dst, + dst_pitch, height, filter, bd); +} + +// Calculation with averaging the input pixels + +static void write_8x1_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + (void)y1; + (void)pitch; + const __m128i a0 = _mm256_castsi256_si128(*y0); + const __m128i a1 = _mm256_extractf128_si256(*y0, 1); + __m128i res = _mm_packus_epi32(a0, a1); + const __m128i pix = _mm_loadu_si128((const __m128i *)dst); + res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); + res = _mm_avg_epu16(res, pix); + _mm_storeu_si128((__m128i *)dst, res); +} + +static void write_8x2_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst); + const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch)); + const __m256i pix = + _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); + a = _mm256_min_epi16(a, *mask); + a = _mm256_avg_epu16(a, pix); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); +} + +static void write_16x1_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + (void)pitch; + __m256i a = _mm256_packus_epi32(*y0, *y1); + const __m256i pix = _mm256_loadu_si256((const __m256i *)dst); + a = _mm256_min_epi16(a, *mask); + a = _mm256_avg_epu16(a, pix); + _mm256_storeu_si256((__m256i *)dst, a); +} + +static INLINE void write_8x1_avg_pixels_ver(const __m256i *y0, + const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + (void)pitch; + const __m128i v0 = _mm256_castsi256_si128(*y0); + const __m128i v1 = _mm256_castsi256_si128(*y1); + __m128i p = _mm_packus_epi32(v0, v1); + const __m128i pix = _mm_loadu_si128((const __m128i *)dst); + p = _mm_min_epi16(p, 
_mm256_castsi256_si128(*mask)); + p = _mm_avg_epu16(p, pix); + _mm_storeu_si128((__m128i *)dst, p); +} + +static INLINE void write_16x2_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst); + const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch)); + __m256i p = _mm256_min_epi16(*y0, *mask); + p = _mm256_avg_epu16(p, pix0); + _mm256_storeu_si256((__m256i *)dst, p); + + p = _mm256_min_epi16(*y1, *mask); + p = _mm256_avg_epu16(p, pix1); + _mm256_storeu_si256((__m256i *)(dst + pitch), p); +} + +static INLINE void write_16x1_avg_pixels_ver(const __m256i *y0, + const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + (void)y1; + (void)pitch; + __m256i p = _mm256_min_epi16(*y0, *mask); + const __m256i pix = _mm256_loadu_si256((const __m256i *)dst); + p = _mm256_avg_epu16(p, pix); + _mm256_storeu_si256((__m256i *)dst, p); +} + +static void write_8x1_2t_avg_pixels_ver(const __m128i *y0, const __m128i *y1, + const __m128i *mask, uint16_t *dst) { + __m128i res = _mm_packus_epi32(*y0, *y1); + const __m128i pix = _mm_loadu_si128((const __m128i *)dst); + res = _mm_min_epi16(res, *mask); + res = _mm_avg_epu16(res, pix); + _mm_storeu_si128((__m128i *)dst, res); +} + +static void aom_highbd_filter_block1d8_h8_avg_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width8_horiz(src, src_pitch, write_8x1_avg_pixels, + write_8x2_avg_pixels, dst, dst_pitch, height, + filter, bd); +} + +static void aom_highbd_filter_block1d16_h8_avg_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width16_horiz(src, src_pitch, write_16x1_avg_pixels, dst, + dst_pitch, height, filter, bd); +} + +static void aom_highbd_filter_block1d8_v8_avg_avx2( + 
const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width8_vert(src, src_pitch, write_8x1_avg_pixels_ver, + write_8x2_avg_pixels, dst, dst_pitch, height, filter, + bd); +} + +static void aom_highbd_filter_block1d16_v8_avg_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width16_vert(src, src_pitch, write_16x1_avg_pixels_ver, + write_16x2_avg_pixels, dst, dst_pitch, height, + filter, bd); +} + +// 2-tap averaging + +static void aom_highbd_filter_block1d8_h2_avg_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width8_2t_horiz(src, src_pitch, write_8x1_avg_pixels, + write_8x2_avg_pixels, dst, dst_pitch, height, + filter, bd); +} + +static void aom_highbd_filter_block1d16_h2_avg_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width16_2t_horiz(src, src_pitch, write_16x1_avg_pixels, dst, + dst_pitch, height, filter, bd); +} + +static void aom_highbd_filter_block1d16_v2_avg_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width16_2t_vert(src, src_pitch, write_16x1_avg_pixels, dst, + dst_pitch, height, filter, bd); +} + +static void aom_highbd_filter_block1d8_v2_avg_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width8_2t_vert(src, src_pitch, write_8x1_2t_avg_pixels_ver, dst, + dst_pitch, height, filter, bd); +} + +typedef void HbdFilter1dFunc(const uint16_t *, ptrdiff_t, uint16_t *, ptrdiff_t, + uint32_t, const int16_t *, int); + +#define HIGHBD_FUNC(width, dir, avg, 
opt) \ + aom_highbd_filter_block1d##width##_##dir##_##avg##opt + +HbdFilter1dFunc HIGHBD_FUNC(4, h8, , sse2); +HbdFilter1dFunc HIGHBD_FUNC(4, h2, , sse2); +HbdFilter1dFunc HIGHBD_FUNC(4, v8, , sse2); +HbdFilter1dFunc HIGHBD_FUNC(4, v2, , sse2); + +#define aom_highbd_filter_block1d4_h8_avx2 HIGHBD_FUNC(4, h8, , sse2) +#define aom_highbd_filter_block1d4_h2_avx2 HIGHBD_FUNC(4, h2, , sse2) +#define aom_highbd_filter_block1d4_v8_avx2 HIGHBD_FUNC(4, v8, , sse2) +#define aom_highbd_filter_block1d4_v2_avx2 HIGHBD_FUNC(4, v2, , sse2) + +HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); +HIGH_FUN_CONV_2D(, avx2); + +HbdFilter1dFunc HIGHBD_FUNC(4, h8, avg_, sse2); +HbdFilter1dFunc HIGHBD_FUNC(4, h2, avg_, sse2); +HbdFilter1dFunc HIGHBD_FUNC(4, v8, avg_, sse2); +HbdFilter1dFunc HIGHBD_FUNC(4, v2, avg_, sse2); + +#define aom_highbd_filter_block1d4_h8_avg_avx2 HIGHBD_FUNC(4, h8, avg_, sse2) +#define aom_highbd_filter_block1d4_h2_avg_avx2 HIGHBD_FUNC(4, h2, avg_, sse2) +#define aom_highbd_filter_block1d4_v8_avg_avx2 HIGHBD_FUNC(4, v8, avg_, sse2) +#define aom_highbd_filter_block1d4_v2_avg_avx2 HIGHBD_FUNC(4, v2, avg_, sse2) + +HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, avx2); +HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, + avx2); +HIGH_FUN_CONV_2D(avg_, avx2); + +#undef HIGHBD_FUNC diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm new file mode 100644 index 000000000..5d84ef8a7 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm @@ -0,0 +1,456 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 4 dd 16 +pw_32: times 4 dd 32 + +SECTION .text +INIT_XMM sse2 +cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + movq m0, [aboveq] + movq m2, [leftq] + paddw m0, m2 + pshuflw m1, m0, 0xe + paddw m0, m1 + pshuflw m1, m0, 0x1 + paddw m0, m1 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, one + mov oned, 0x00010001 + lea stride3q, [strideq*3] + movd m3, oned + pshufd m3, m3, 0x0 + paddw m0, m2 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + paddw m0, [GLOBAL(pw_8)] + psrlw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m3, [aboveq+16] + mova m2, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + paddw 
m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_16)] + psrad m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [aboveq+32] + mova m4, [aboveq+48] + paddw m0, m2 + paddw m3, m4 + mova m2, [leftq] + mova m4, [leftq+16] + mova m5, [leftq+32] + mova m6, [leftq+48] + paddw m2, m4 + paddw m5, m6 + paddw m0, m3 + paddw m2, m5 + pxor m1, m1 + paddw m0, m2 + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_32)] + psrad m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16 ], m0 + mova [dstq +32 ], m0 + mova [dstq +48 ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16 ], m0 + mova [dstq+strideq*2+32 ], m0 + mova [dstq+strideq*2+48 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4+16 ], m0 + mova [dstq+strideq*4+32 ], m0 + mova [dstq+strideq*4+48 ], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m0 + mova [dstq+stride3q*2 +32], m0 + mova [dstq+stride3q*2 +48], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, 
[dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m1 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + mova m2, [aboveq+32] + mova m3, [aboveq+48] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq +32], m2 + mova [dstq +48], m3 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*2 +32], m2 + mova [dstq+strideq*2 +48], m3 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+strideq*4 +32], m2 + mova [dstq+strideq*4 +48], m3 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m1 + mova [dstq+stride3q*2 +32], m2 + mova [dstq+stride3q*2 +48], m3 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps + movd m1, [aboveq-2] + movq m0, [aboveq] + pshuflw m1, m1, 0x0 + movlhps m0, m0 ; t1 t2 t3 t4 t1 t2 t3 t4 + movlhps m1, 
m1 ; tl tl tl tl tl tl tl tl + ; Get the values to compute the maximum value at this bit depth + pcmpeqw m3, m3 + movd m4, bpsd + psubw m0, m1 ; t1-tl t2-tl t3-tl t4-tl + psllw m3, m4 + pcmpeqw m2, m2 + pxor m4, m4 ; min possible value + pxor m3, m2 ; max possible value + mova m1, [leftq] + pshuflw m2, m1, 0x0 + pshuflw m5, m1, 0x55 + movlhps m2, m5 ; l1 l1 l1 l1 l2 l2 l2 l2 + paddw m2, m0 + ;Clamp to the bit-depth + pminsw m2, m3 + pmaxsw m2, m4 + ;Store the values + movq [dstq ], m2 + movhpd [dstq+strideq*2], m2 + lea dstq, [dstq+strideq*4] + pshuflw m2, m1, 0xaa + pshuflw m5, m1, 0xff + movlhps m2, m5 + paddw m2, m0 + ;Clamp to the bit-depth + pminsw m2, m3 + pmaxsw m2, m4 + ;Store the values + movq [dstq ], m2 + movhpd [dstq+strideq*2], m2 + RET + +INIT_XMM sse2 +cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one + movd m1, [aboveq-2] + mova m0, [aboveq] + pshuflw m1, m1, 0x0 + ; Get the values to compute the maximum value at this bit depth + mov oned, 1 + pxor m3, m3 + pxor m4, m4 + pinsrw m3, oned, 0 + pinsrw m4, bpsd, 0 + pshuflw m3, m3, 0x0 + DEFINE_ARGS dst, stride, line, left + punpcklqdq m3, m3 + mov lineq, -4 + mova m2, m3 + punpcklqdq m1, m1 + psllw m3, m4 + add leftq, 16 + psubw m3, m2 ; max possible value + pxor m4, m4 ; min possible value + psubw m0, m1 +.loop: + movd m1, [leftq+lineq*4] + movd m2, [leftq+lineq*4+2] + pshuflw m1, m1, 0x0 + pshuflw m2, m2, 0x0 + punpcklqdq m1, m1 + punpcklqdq m2, m2 + paddw m1, m0 + paddw m2, m0 + ;Clamp to the bit-depth + pminsw m1, m3 + pminsw m2, m3 + pmaxsw m1, m4 + pmaxsw m2, m4 + ;Store the values + mova [dstq ], m1 + mova [dstq+strideq*2], m2 + lea dstq, [dstq+strideq*4] + inc lineq + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps + movd m2, [aboveq-2] + mova m0, [aboveq] + mova m1, [aboveq+16] + pshuflw m2, m2, 0x0 + ; Get the values to compute the maximum value at this bit depth + pcmpeqw m3, m3 + movd m4, bpsd + 
punpcklqdq m2, m2 + psllw m3, m4 + pcmpeqw m5, m5 + pxor m4, m4 ; min possible value + pxor m3, m5 ; max possible value + DEFINE_ARGS dst, stride, line, left + mov lineq, -8 + psubw m0, m2 + psubw m1, m2 +.loop: + movd m7, [leftq] + pshuflw m5, m7, 0x0 + pshuflw m2, m7, 0x55 + punpcklqdq m5, m5 ; l1 l1 l1 l1 l1 l1 l1 l1 + punpcklqdq m2, m2 ; l2 l2 l2 l2 l2 l2 l2 l2 + paddw m6, m5, m0 ; t1-tl+l1 to t4-tl+l1 + paddw m5, m1 ; t5-tl+l1 to t8-tl+l1 + pminsw m6, m3 + pminsw m5, m3 + pmaxsw m6, m4 ; Clamp to the bit-depth + pmaxsw m5, m4 + mova [dstq ], m6 + mova [dstq +16], m5 + paddw m6, m2, m0 + paddw m2, m1 + pminsw m6, m3 + pminsw m2, m3 + pmaxsw m6, m4 + pmaxsw m2, m4 + mova [dstq+strideq*2 ], m6 + mova [dstq+strideq*2+16], m2 + lea dstq, [dstq+strideq*4] + inc lineq + lea leftq, [leftq+4] + + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps + movd m0, [aboveq-2] + mova m1, [aboveq] + mova m2, [aboveq+16] + mova m3, [aboveq+32] + mova m4, [aboveq+48] + pshuflw m0, m0, 0x0 + ; Get the values to compute the maximum value at this bit depth + pcmpeqw m5, m5 + movd m6, bpsd + psllw m5, m6 + pcmpeqw m7, m7 + pxor m6, m6 ; min possible value + pxor m5, m7 ; max possible value + punpcklqdq m0, m0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -16 + psubw m1, m0 + psubw m2, m0 + psubw m3, m0 + psubw m4, m0 +.loop: + movd m7, [leftq] + pshuflw m7, m7, 0x0 + punpcklqdq m7, m7 ; l1 l1 l1 l1 l1 l1 l1 l1 + paddw m0, m7, m1 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq ], m0 + paddw m0, m7, m2 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq +16], m0 + paddw m0, m7, m3 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq +32], m0 + paddw m0, m7, m4 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq +48], m0 + movd m7, [leftq+2] + pshuflw m7, m7, 0x0 + punpcklqdq m7, m7 ; l2 l2 l2 l2 l2 l2 l2 l2 + paddw m0, m7, m1 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq+strideq*2 ], m0 + paddw m0, m7, m2 + pminsw m0, m5 + pmaxsw m0, m6 + mova 
[dstq+strideq*2+16], m0 + paddw m0, m7, m3 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq+strideq*2+32], m0 + paddw m0, m7, m4 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq+strideq*2+48], m0 + lea dstq, [dstq+strideq*4] + lea leftq, [leftq+4] + inc lineq + jnz .loop + REP_RET diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c new file mode 100644 index 000000000..76369871b --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -0,0 +1,1140 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include // SSE2 + +#include "./aom_dsp_rtcd.h" +#include "aom_ports/mem.h" +#include "aom_ports/emmintrin_compat.h" + +static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { + __m128i ubounded; + __m128i lbounded; + __m128i retval; + + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi16(1); + __m128i t80, max, min; + + if (bd == 8) { + t80 = _mm_set1_epi16(0x80); + max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80); + } else if (bd == 10) { + t80 = _mm_set1_epi16(0x200); + max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80); + } else { // bd == 12 + t80 = _mm_set1_epi16(0x800); + max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80); + } + + min = _mm_subs_epi16(zero, t80); + + ubounded = _mm_cmpgt_epi16(value, max); + lbounded = _mm_cmplt_epi16(value, min); + retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value); + ubounded = _mm_and_si128(ubounded, max); + lbounded = _mm_and_si128(lbounded, min); + retval = _mm_or_si128(retval, ubounded); + retval = _mm_or_si128(retval, lbounded); + return retval; +} + +// TODO(debargha, peter): Break up large functions into smaller ones +// in this file. 
+void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi16(1); + __m128i blimit, limit, thresh; + __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0; + __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0; + __m128i ps1, qs1, ps0, qs0; + __m128i abs_p0q0, abs_p1q1, ffff, work; + __m128i filt, work_a, filter1, filter2; + __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4; + __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1; + __m128i flat2_q0, flat2_p0; + __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3; + __m128i t4, t3, t80, t1; + __m128i eight, four; + + if (bd == 8) { + blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); + limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); + thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + } else if (bd == 10) { + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + } else { // bd == 12 + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + } + + q4 = _mm_load_si128((__m128i *)(s + 4 * p)); + p4 = _mm_load_si128((__m128i *)(s - 5 * p)); + q3 = _mm_load_si128((__m128i *)(s + 3 * p)); + p3 = _mm_load_si128((__m128i 
*)(s - 4 * p)); + q2 = _mm_load_si128((__m128i *)(s + 2 * p)); + p2 = _mm_load_si128((__m128i *)(s - 3 * p)); + q1 = _mm_load_si128((__m128i *)(s + 1 * p)); + p1 = _mm_load_si128((__m128i *)(s - 2 * p)); + q0 = _mm_load_si128((__m128i *)(s + 0 * p)); + p0 = _mm_load_si128((__m128i *)(s - 1 * p)); + + // highbd_filter_mask + abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); + abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); + + ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); + + abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); + + // highbd_hev_mask (in C code this is actually called from highbd_filter4) + flat = _mm_max_epi16(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu16(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2 + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2 + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)), + _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1))); + mask = _mm_max_epi16(work, mask); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), + _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2))); + mask = _mm_max_epi16(work, mask); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)), + _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); + mask = _mm_max_epi16(work, mask); + + mask = _mm_subs_epu16(mask, limit); + mask = _mm_cmpeq_epi16(mask, zero); // return ~mask + + // lp filter + // highbd_filter4 + t4 = _mm_set1_epi16(4); + t3 = _mm_set1_epi16(3); + if (bd == 8) + t80 = 
_mm_set1_epi16(0x80); + else if (bd == 10) + t80 = _mm_set1_epi16(0x200); + else // bd == 12 + t80 = _mm_set1_epi16(0x800); + + t1 = _mm_set1_epi16(0x1); + + ps1 = _mm_subs_epi16(p1, t80); + qs1 = _mm_subs_epi16(q1, t80); + ps0 = _mm_subs_epi16(p0, t80); + qs0 = _mm_subs_epi16(q0, t80); + + filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), + hev); + work_a = _mm_subs_epi16(qs0, ps0); + filt = _mm_adds_epi16(filt, work_a); + filt = _mm_adds_epi16(filt, work_a); + filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd); + filt = _mm_and_si128(filt, mask); + filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd); + filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd); + + // Filter1 >> 3 + filter1 = _mm_srai_epi16(filter1, 0x3); + filter2 = _mm_srai_epi16(filter2, 0x3); + + qs0 = _mm_adds_epi16( + signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80); + ps0 = _mm_adds_epi16( + signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80); + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(hev, filt); + qs1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), + t80); + ps1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), + t80); + + // end highbd_filter4 + // loopfilter done + + // highbd_flat_mask4 + flat = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)), + _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)), + _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3))); + flat = _mm_max_epi16(work, flat); + work = _mm_max_epi16(abs_p1p0, abs_q1q0); + flat = _mm_max_epi16(work, flat); + + if (bd == 8) + flat = _mm_subs_epu16(flat, one); + else if (bd == 10) + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2)); + else // bd == 12 + flat = _mm_subs_epu16(flat, 
_mm_slli_epi16(one, 4)); + + flat = _mm_cmpeq_epi16(flat, zero); + // end flat_mask4 + + // flat & mask = flat && mask (as used in filter8) + // (because, in both vars, each block of 16 either all 1s or all 0s) + flat = _mm_and_si128(flat, mask); + + p5 = _mm_load_si128((__m128i *)(s - 6 * p)); + q5 = _mm_load_si128((__m128i *)(s + 5 * p)); + p6 = _mm_load_si128((__m128i *)(s - 7 * p)); + q6 = _mm_load_si128((__m128i *)(s + 6 * p)); + p7 = _mm_load_si128((__m128i *)(s - 8 * p)); + q7 = _mm_load_si128((__m128i *)(s + 7 * p)); + + // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7 + // but referred to as p0-p4 & q0-q4 in fn) + flat2 = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)), + _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4))); + + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)), + _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5))); + flat2 = _mm_max_epi16(work, flat2); + + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)), + _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6))); + flat2 = _mm_max_epi16(work, flat2); + + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)), + _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7))); + flat2 = _mm_max_epi16(work, flat2); + + if (bd == 8) + flat2 = _mm_subs_epu16(flat2, one); + else if (bd == 10) + flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2)); + else // bd == 12 + flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4)); + + flat2 = _mm_cmpeq_epi16(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + // end highbd_flat_mask5 + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + eight = _mm_set1_epi16(8); + four = _mm_set1_epi16(4); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3)); + pixelFilter_q = 
_mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + flat2_p0 = + _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4); + flat2_q0 = + _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4); + flat_p0 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3); + flat_q0 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3); + + sum_p7 = _mm_add_epi16(p7, p7); + sum_q7 = _mm_add_epi16(q7, q7); + sum_p3 = _mm_add_epi16(p3, p3); + sum_q3 = _mm_add_epi16(q3, q3); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6); + flat2_p1 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4); + flat2_q1 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2); + flat_p1 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3); + flat_q1 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + sum_p3 = _mm_add_epi16(sum_p3, p3); + sum_q3 = _mm_add_epi16(sum_q3, q3); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5); + flat2_p2 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4); + flat2_q2 = 
_mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1); + flat_p2 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3); + flat_q2 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4); + flat2_p3 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4); + flat2_q3 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3); + flat2_p4 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4); + flat2_q4 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2); + flat2_p5 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4); + flat2_q5 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1); + flat2_p6 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4); + flat2_q6 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6)), 4); + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + // highbd_filter8 + p2 = _mm_andnot_si128(flat, p2); + // p2 
remains unchanged if !(flat && mask) + flat_p2 = _mm_and_si128(flat, flat_p2); + // when (flat && mask) + p2 = _mm_or_si128(p2, flat_p2); // full list of p2 values + q2 = _mm_andnot_si128(flat, q2); + flat_q2 = _mm_and_si128(flat, flat_q2); + q2 = _mm_or_si128(q2, flat_q2); // full list of q2 values + + ps1 = _mm_andnot_si128(flat, ps1); + // p1 takes the value assigned to in in filter4 if !(flat && mask) + flat_p1 = _mm_and_si128(flat, flat_p1); + // when (flat && mask) + p1 = _mm_or_si128(ps1, flat_p1); // full list of p1 values + qs1 = _mm_andnot_si128(flat, qs1); + flat_q1 = _mm_and_si128(flat, flat_q1); + q1 = _mm_or_si128(qs1, flat_q1); // full list of q1 values + + ps0 = _mm_andnot_si128(flat, ps0); + // p0 takes the value assigned to in in filter4 if !(flat && mask) + flat_p0 = _mm_and_si128(flat, flat_p0); + // when (flat && mask) + p0 = _mm_or_si128(ps0, flat_p0); // full list of p0 values + qs0 = _mm_andnot_si128(flat, qs0); + flat_q0 = _mm_and_si128(flat, flat_q0); + q0 = _mm_or_si128(qs0, flat_q0); // full list of q0 values + // end highbd_filter8 + + // highbd_filter16 + p6 = _mm_andnot_si128(flat2, p6); + // p6 remains unchanged if !(flat2 && flat && mask) + flat2_p6 = _mm_and_si128(flat2, flat2_p6); + // get values for when (flat2 && flat && mask) + p6 = _mm_or_si128(p6, flat2_p6); // full list of p6 values + q6 = _mm_andnot_si128(flat2, q6); + // q6 remains unchanged if !(flat2 && flat && mask) + flat2_q6 = _mm_and_si128(flat2, flat2_q6); + // get values for when (flat2 && flat && mask) + q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values + _mm_store_si128((__m128i *)(s - 7 * p), p6); + _mm_store_si128((__m128i *)(s + 6 * p), q6); + + p5 = _mm_andnot_si128(flat2, p5); + // p5 remains unchanged if !(flat2 && flat && mask) + flat2_p5 = _mm_and_si128(flat2, flat2_p5); + // get values for when (flat2 && flat && mask) + p5 = _mm_or_si128(p5, flat2_p5); + // full list of p5 values + q5 = _mm_andnot_si128(flat2, q5); + // q5 remains unchanged if 
!(flat2 && flat && mask) + flat2_q5 = _mm_and_si128(flat2, flat2_q5); + // get values for when (flat2 && flat && mask) + q5 = _mm_or_si128(q5, flat2_q5); + // full list of q5 values + _mm_store_si128((__m128i *)(s - 6 * p), p5); + _mm_store_si128((__m128i *)(s + 5 * p), q5); + + p4 = _mm_andnot_si128(flat2, p4); + // p4 remains unchanged if !(flat2 && flat && mask) + flat2_p4 = _mm_and_si128(flat2, flat2_p4); + // get values for when (flat2 && flat && mask) + p4 = _mm_or_si128(p4, flat2_p4); // full list of p4 values + q4 = _mm_andnot_si128(flat2, q4); + // q4 remains unchanged if !(flat2 && flat && mask) + flat2_q4 = _mm_and_si128(flat2, flat2_q4); + // get values for when (flat2 && flat && mask) + q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values + _mm_store_si128((__m128i *)(s - 5 * p), p4); + _mm_store_si128((__m128i *)(s + 4 * p), q4); + + p3 = _mm_andnot_si128(flat2, p3); + // p3 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_p3 = _mm_and_si128(flat2, flat2_p3); + // get values for when (flat2 && flat && mask) + p3 = _mm_or_si128(p3, flat2_p3); // full list of p3 values + q3 = _mm_andnot_si128(flat2, q3); + // q3 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_q3 = _mm_and_si128(flat2, flat2_q3); + // get values for when (flat2 && flat && mask) + q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values + _mm_store_si128((__m128i *)(s - 4 * p), p3); + _mm_store_si128((__m128i *)(s + 3 * p), q3); + + p2 = _mm_andnot_si128(flat2, p2); + // p2 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_p2 = _mm_and_si128(flat2, flat2_p2); + // get values for when (flat2 && flat && mask) + p2 = _mm_or_si128(p2, flat2_p2); + // full list of p2 values + q2 = _mm_andnot_si128(flat2, q2); + // q2 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_q2 = _mm_and_si128(flat2, flat2_q2); + // get values for when (flat2 && flat && mask) + q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values + 
_mm_store_si128((__m128i *)(s - 3 * p), p2); + _mm_store_si128((__m128i *)(s + 2 * p), q2); + + p1 = _mm_andnot_si128(flat2, p1); + // p1 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_p1 = _mm_and_si128(flat2, flat2_p1); + // get values for when (flat2 && flat && mask) + p1 = _mm_or_si128(p1, flat2_p1); // full list of p1 values + q1 = _mm_andnot_si128(flat2, q1); + // q1 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_q1 = _mm_and_si128(flat2, flat2_q1); + // get values for when (flat2 && flat && mask) + q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values + _mm_store_si128((__m128i *)(s - 2 * p), p1); + _mm_store_si128((__m128i *)(s + 1 * p), q1); + + p0 = _mm_andnot_si128(flat2, p0); + // p0 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_p0 = _mm_and_si128(flat2, flat2_p0); + // get values for when (flat2 && flat && mask) + p0 = _mm_or_si128(p0, flat2_p0); // full list of p0 values + q0 = _mm_andnot_si128(flat2, q0); + // q0 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_q0 = _mm_and_si128(flat2, flat2_q0); + // get values for when (flat2 && flat && mask) + q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values + _mm_store_si128((__m128i *)(s - 1 * p), p0); + _mm_store_si128((__m128i *)(s - 0 * p), q0); +} + +void aom_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + aom_highbd_lpf_horizontal_edge_8_sse2(s, p, _blimit, _limit, _thresh, bd); + aom_highbd_lpf_horizontal_edge_8_sse2(s + 8, p, _blimit, _limit, _thresh, bd); +} + +void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); + DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); + DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]); + DECLARE_ALIGNED(16, 
uint16_t, flat_oq1[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); + const __m128i zero = _mm_set1_epi16(0); + __m128i blimit, limit, thresh; + __m128i mask, hev, flat; + __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p)); + __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p)); + __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p)); + __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p)); + __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p)); + __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p)); + __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p)); + const __m128i one = _mm_set1_epi16(1); + const __m128i ffff = _mm_cmpeq_epi16(one, one); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_shft; + + const __m128i t4 = _mm_set1_epi16(4); + const __m128i t3 = _mm_set1_epi16(3); + __m128i t80; + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i ps1, ps0, qs0, qs1; + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + if (bd == 8) { + blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); + limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); + thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + t80 = _mm_set1_epi16(0x80); + } else if (bd == 10) { + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + t80 = _mm_set1_epi16(0x200); + } else { // bd == 12 + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const 
__m128i *)_thresh), zero), 4); + t80 = _mm_set1_epi16(0x800); + } + + ps1 = _mm_subs_epi16(p1, t80); + ps0 = _mm_subs_epi16(p0, t80); + qs0 = _mm_subs_epi16(q0, t80); + qs1 = _mm_subs_epi16(q1, t80); + + // filter_mask and hev_mask + abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); + abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); + + abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); + flat = _mm_max_epi16(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu16(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_max_epi16(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + mask = _mm_max_epi16(abs_q1q0, mask); + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), + _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2))); + mask = _mm_max_epi16(work, mask); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)), + _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); + mask = _mm_max_epi16(work, mask); + mask = _mm_subs_epu16(mask, limit); + mask = _mm_cmpeq_epi16(mask, zero); + + // flat_mask4 + flat = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)), + _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)), + _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, 
q3))); + flat = _mm_max_epi16(work, flat); + flat = _mm_max_epi16(abs_p1p0, flat); + flat = _mm_max_epi16(abs_q1q0, flat); + + if (bd == 8) + flat = _mm_subs_epu16(flat, one); + else if (bd == 10) + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2)); + else // bd == 12 + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); // flat & mask + + // Added before shift for rounding part of ROUND_POWER_OF_TWO + + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_op2[0], workp_shft); + + workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_op1[0], workp_shft); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_op0[0], workp_shft); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft); + + // lp filter + filt = 
signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd); + filt = _mm_and_si128(filt, hev); + work_a = _mm_subs_epi16(qs0, ps0); + filt = _mm_adds_epi16(filt, work_a); + filt = _mm_adds_epi16(filt, work_a); + filt = _mm_adds_epi16(filt, work_a); + // (aom_filter + 3 * (qs0 - ps0)) & mask + filt = signed_char_clamp_bd_sse2(filt, bd); + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi16(filt, t4); + filter2 = _mm_adds_epi16(filt, t3); + + // Filter1 >> 3 + filter1 = signed_char_clamp_bd_sse2(filter1, bd); + filter1 = _mm_srai_epi16(filter1, 3); + + // Filter2 >> 3 + filter2 = signed_char_clamp_bd_sse2(filter2, bd); + filter2 = _mm_srai_epi16(filter2, 3); + + // filt >> 1 + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + filt = _mm_andnot_si128(hev, filt); + + work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd); + work_a = _mm_adds_epi16(work_a, t80); + q0 = _mm_load_si128((__m128i *)flat_oq0); + work_a = _mm_andnot_si128(flat, work_a); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + + work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd); + work_a = _mm_adds_epi16(work_a, t80); + q1 = _mm_load_si128((__m128i *)flat_oq1); + work_a = _mm_andnot_si128(flat, work_a); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + + work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q2 = _mm_load_si128((__m128i *)flat_oq2); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + + work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd); + work_a = _mm_adds_epi16(work_a, t80); + p0 = _mm_load_si128((__m128i *)flat_op0); + work_a = _mm_andnot_si128(flat, work_a); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + + work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd); + work_a = _mm_adds_epi16(work_a, t80); + p1 = _mm_load_si128((__m128i 
*)flat_op1); + work_a = _mm_andnot_si128(flat, work_a); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + + work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p2 = _mm_load_si128((__m128i *)flat_op2); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + + _mm_store_si128((__m128i *)(s - 3 * p), p2); + _mm_store_si128((__m128i *)(s - 2 * p), p1); + _mm_store_si128((__m128i *)(s - 1 * p), p0); + _mm_store_si128((__m128i *)(s + 0 * p), q0); + _mm_store_si128((__m128i *)(s + 1 * p), q1); + _mm_store_si128((__m128i *)(s + 2 * p), q2); +} + +void aom_highbd_lpf_horizontal_8_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + aom_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd); + aom_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); +} + +void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + const __m128i zero = _mm_set1_epi16(0); + __m128i blimit, limit, thresh; + __m128i mask, hev, flat; + __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); + const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); + const __m128i one = _mm_set1_epi16(1); + __m128i 
abs_p0q0 = + _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); + __m128i work; + const __m128i t4 = _mm_set1_epi16(4); + const __m128i t3 = _mm_set1_epi16(3); + __m128i t80; + __m128i tff80; + __m128i tffe0; + __m128i t1f; + // equivalent to shifting 0x1f left by bitdepth - 8 + // and setting new bits to 1 + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i t7f; + // equivalent to shifting 0x7f left by bitdepth - 8 + // and setting new bits to 1 + __m128i ps1, ps0, qs0, qs1; + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + if (bd == 8) { + blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); + limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); + thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + t80 = _mm_set1_epi16(0x80); + tff80 = _mm_set1_epi16(0xff80); + tffe0 = _mm_set1_epi16(0xffe0); + t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8); + t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8); + } else if (bd == 10) { + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2); + tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2); + tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2); + t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6); + t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6); + } else { // bd == 12 + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + t80 = 
_mm_slli_epi16(_mm_set1_epi16(0x80), 4); + tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4); + tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4); + t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4); + t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4); + } + + ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); + qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + + // filter_mask and hev_mask + flat = _mm_max_epi16(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu16(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_max_epi16(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), + _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3))); + mask = _mm_max_epi16(work, mask); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)), + _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); + mask = _mm_max_epi16(work, mask); + mask = _mm_subs_epu16(mask, limit); + mask = _mm_cmpeq_epi16(mask, zero); + + // filter4 + filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd); + filt = _mm_and_si128(filt, hev); + work_a = _mm_subs_epi16(qs0, ps0); + filt = _mm_adds_epi16(filt, work_a); + filt = _mm_adds_epi16(filt, work_a); + filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd); + + // (aom_filter + 3 * (qs0 
- ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd); + filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0 + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0 + filter1 = _mm_and_si128(filter1, t1f); // clamp the range + filter1 = _mm_or_si128(filter1, work_a); // reinsert the sign bits + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi16(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, tffe0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + // filt >> 1 + filt = _mm_adds_epi16(filter1, t1); + work_a = _mm_cmpgt_epi16(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, tff80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + + filt = _mm_andnot_si128(hev, filt); + + q0 = _mm_adds_epi16( + signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80); + q1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), + t80); + p0 = _mm_adds_epi16( + signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80); + p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), + t80); + + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); +} + +void aom_highbd_lpf_horizontal_4_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + aom_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd); + aom_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); +} + +static INLINE 
void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[], + int out_p, int num_8x8_to_transpose) { + int idx8x8 = 0; + __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7; + do { + uint16_t *in = src[idx8x8]; + uint16_t *out = dst[idx8x8]; + + p0 = + _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 + p1 = + _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 + p2 = + _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 + p3 = + _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 + p4 = + _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 + p5 = + _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 + p6 = + _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 + p7 = + _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 + // 00 10 01 11 02 12 03 13 + x0 = _mm_unpacklo_epi16(p0, p1); + // 20 30 21 31 22 32 23 33 + x1 = _mm_unpacklo_epi16(p2, p3); + // 40 50 41 51 42 52 43 53 + x2 = _mm_unpacklo_epi16(p4, p5); + // 60 70 61 71 62 72 63 73 + x3 = _mm_unpacklo_epi16(p6, p7); + // 00 10 20 30 01 11 21 31 + x4 = _mm_unpacklo_epi32(x0, x1); + // 40 50 60 70 41 51 61 71 + x5 = _mm_unpacklo_epi32(x2, x3); + // 00 10 20 30 40 50 60 70 + x6 = _mm_unpacklo_epi64(x4, x5); + // 01 11 21 31 41 51 61 71 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6); + // 00 10 20 30 40 50 60 70 + _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7); + // 01 11 21 31 41 51 61 71 + + // 02 12 22 32 03 13 23 33 + x4 = _mm_unpackhi_epi32(x0, x1); + // 42 52 62 72 43 53 63 73 + x5 = _mm_unpackhi_epi32(x2, x3); + // 02 12 22 32 42 52 62 72 + x6 = _mm_unpacklo_epi64(x4, x5); + // 03 13 23 33 43 53 63 73 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6); + // 02 12 22 32 42 52 62 72 + _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7); + // 
03 13 23 33 43 53 63 73 + + // 04 14 05 15 06 16 07 17 + x0 = _mm_unpackhi_epi16(p0, p1); + // 24 34 25 35 26 36 27 37 + x1 = _mm_unpackhi_epi16(p2, p3); + // 44 54 45 55 46 56 47 57 + x2 = _mm_unpackhi_epi16(p4, p5); + // 64 74 65 75 66 76 67 77 + x3 = _mm_unpackhi_epi16(p6, p7); + // 04 14 24 34 05 15 25 35 + x4 = _mm_unpacklo_epi32(x0, x1); + // 44 54 64 74 45 55 65 75 + x5 = _mm_unpacklo_epi32(x2, x3); + // 04 14 24 34 44 54 64 74 + x6 = _mm_unpacklo_epi64(x4, x5); + // 05 15 25 35 45 55 65 75 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6); + // 04 14 24 34 44 54 64 74 + _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7); + // 05 15 25 35 45 55 65 75 + + // 06 16 26 36 07 17 27 37 + x4 = _mm_unpackhi_epi32(x0, x1); + // 46 56 66 76 47 57 67 77 + x5 = _mm_unpackhi_epi32(x2, x3); + // 06 16 26 36 46 56 66 76 + x6 = _mm_unpacklo_epi64(x4, x5); + // 07 17 27 37 47 57 67 77 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6); + // 06 16 26 36 46 56 66 76 + _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7); + // 07 17 27 37 47 57 67 77 + } while (++idx8x8 < num_8x8_to_transpose); +} + +static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p, + uint16_t *out, int out_p) { + uint16_t *src0[1]; + uint16_t *src1[1]; + uint16_t *dest0[1]; + uint16_t *dest1[1]; + src0[0] = in0; + src1[0] = in1; + dest0[0] = out; + dest1[0] = out + 8; + highbd_transpose(src0, in_p, dest0, out_p, 1); + highbd_transpose(src1, in_p, dest1, out_p, 1); +} + +void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); + uint16_t *src[1]; + uint16_t *dst[1]; + + // Transpose 8x8 + src[0] = s - 4; + dst[0] = t_dst; + + highbd_transpose(src, p, dst, 8, 1); + + // Loop filtering + aom_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); + + src[0] = 
t_dst; + dst[0] = s - 4; + + // Transpose back + highbd_transpose(src, 8, dst, p, 1); +} + +void aom_highbd_lpf_vertical_4_dual_sse2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); + uint16_t *src[2]; + uint16_t *dst[2]; + + // Transpose 8x16 + highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + aom_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, bd); + src[0] = t_dst; + src[1] = t_dst + 8; + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + highbd_transpose(src, 16, dst, p, 2); +} + +void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); + uint16_t *src[1]; + uint16_t *dst[1]; + + // Transpose 8x8 + src[0] = s - 4; + dst[0] = t_dst; + + highbd_transpose(src, p, dst, 8, 1); + + // Loop filtering + aom_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); + + src[0] = t_dst; + dst[0] = s - 4; + + // Transpose back + highbd_transpose(src, 8, dst, p, 1); +} + +void aom_highbd_lpf_vertical_8_dual_sse2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); + uint16_t *src[2]; + uint16_t *dst[2]; + + // Transpose 8x16 + highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + aom_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, bd); + src[0] = t_dst; + src[1] = t_dst + 8; + + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + highbd_transpose(src, 16, dst, p, 2); +} + +void 
aom_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]); + uint16_t *src[2]; + uint16_t *dst[2]; + + src[0] = s - 8; + src[1] = s; + dst[0] = t_dst; + dst[1] = t_dst + 8 * 8; + + // Transpose 16x8 + highbd_transpose(src, p, dst, 8, 2); + + // Loop filtering + aom_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh, + bd); + src[0] = t_dst; + src[1] = t_dst + 8 * 8; + dst[0] = s - 8; + dst[1] = s; + + // Transpose back + highbd_transpose(src, 8, dst, p, 2); +} + +void aom_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[256]); + + // Transpose 16x16 + highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); + highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); + + // Loop filtering + aom_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, + thresh, bd); + + // Transpose back + highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); + highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); +} diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c new file mode 100644 index 000000000..3ee24ab16 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i, j, non_zero_regs = (int)count / 4, eob_i = -1; + __m128i zbins[2]; + __m128i nzbins[2]; + + zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], + (int)zbin_ptr[0]); + zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + (void)scan; + + memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = ((int)count / 4) - 1; i >= 0; i--) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (test == 0xffff) + non_zero_regs--; + else + break; + } + + // Quantization pass: + for (i = 0; i < non_zero_regs; i++) { + __m128i coeffs, coeffs_sign, tmp1, tmp2; + int test; + int abs_coeff[4]; + int coeff_sign[4]; + + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + coeffs_sign = _mm_srai_epi32(coeffs, 31); + coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); +
tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); + tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); + tmp1 = _mm_or_si128(tmp1, tmp2); + test = _mm_movemask_epi8(tmp1); + _mm_storeu_si128((__m128i *)abs_coeff, coeffs); + _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); + + for (j = 0; j < 4; j++) { + if (test & (1 << (4 * j))) { + int k = 4 * i + j; + const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; + const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; + const uint32_t abs_qcoeff = + (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); + qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; + dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; + if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; + } + } + } + } + *eob_ptr = eob_i + 1; +} + +void aom_highbd_quantize_b_32x32_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m128i zbins[2]; + __m128i nzbins[2]; + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); + (void)scan; + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); + zbins[1] = _mm_set1_epi32(zbin1_tmp); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = 
_mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; + } + } + *eob_ptr = eob + 1; +} +#endif diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm new file mode 100644 index 000000000..0c7cb3998 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm @@ -0,0 +1,290 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_4x2x4 5-6 0 + movh m0, [srcq +%2*2] +%if %1 == 1 + movu m4, [ref1q+%3*2] + movu m5, [ref2q+%3*2] + movu m6, [ref3q+%3*2] + movu m7, [ref4q+%3*2] + movhps m0, [srcq +%4*2] + movhps m4, [ref1q+%5*2] + movhps m5, [ref2q+%5*2] + movhps m6, [ref3q+%5*2] + movhps m7, [ref4q+%5*2] + mova m3, m0 + mova m2, m0 + psubusw m3, m4 + psubusw m2, m5 + psubusw m4, m0 + psubusw m5, m0 + por m4, m3 + por m5, m2 + pmaddwd m4, m1 + pmaddwd m5, m1 + mova m3, m0 + mova m2, m0 + psubusw m3, m6 + psubusw m2, m7 + psubusw m6, m0 + psubusw m7, m0 + por m6, m3 + por m7, m2 + pmaddwd m6, m1 + pmaddwd m7, m1 +%else + movu m2, [ref1q+%3*2] + movhps m0, [srcq +%4*2] + movhps m2, [ref1q+%5*2] + mova m3, m0 + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m4, m2 + + movu m2, [ref2q+%3*2] + mova m3, m0 + movhps m2, [ref2q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m5, m2 + + movu m2, [ref3q+%3*2] + mova m3, m0 + movhps m2, [ref3q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m6, m2 + + movu m2, [ref4q+%3*2] + mova m3, m0 + movhps m2, [ref4q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*4] + lea ref1q, [ref1q+ref_strideq*4] + lea ref2q, [ref2q+ref_strideq*4] + lea ref3q, [ref3q+ref_strideq*4] + lea ref4q, [ref4q+ref_strideq*4] +%endif +%endmacro + +; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_8x2x4 5-6 0 + ; 1st 8 px + mova m0, [srcq +%2*2] +%if %1 == 1 + movu m4, [ref1q+%3*2] + movu m5, [ref2q+%3*2] + movu m6, [ref3q+%3*2] + movu m7, [ref4q+%3*2] + mova m3, m0 + mova m2, m0 + psubusw m3, m4 + psubusw m2, m5 + psubusw m4, m0 + psubusw m5, m0 + por m4, m3 + por m5, m2 + pmaddwd m4, m1 + pmaddwd m5, m1 + mova m3, m0 +
mova m2, m0 + psubusw m3, m6 + psubusw m2, m7 + psubusw m6, m0 + psubusw m7, m0 + por m6, m3 + por m7, m2 + pmaddwd m6, m1 + pmaddwd m7, m1 +%else + mova m3, m0 + movu m2, [ref1q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m4, m2 + movu m2, [ref2q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m5, m2 + movu m2, [ref3q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m6, m2 + movu m2, [ref4q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endif + + ; 2nd 8 px + mova m0, [srcq +(%4)*2] + mova m3, m0 + movu m2, [ref1q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m4, m2 + movu m2, [ref2q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m5, m2 + movu m2, [ref3q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m6, m2 + movu m2, [ref4q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 +%if %6 == 1 + lea srcq, [srcq +src_strideq*4] + lea ref1q, [ref1q+ref_strideq*4] + lea ref2q, [ref2q+ref_strideq*4] + lea ref3q, [ref3q+ref_strideq*4] + lea ref4q, [ref4q+ref_strideq*4] +%endif + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endmacro + +; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_16x2x4 5-6 0 + HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8) + HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6 +%endmacro + +; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_32x2x4 5-6 0 + HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16) + HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6 +%endmacro + +; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_64x2x4 5-6 0 + HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32) + HIGH_PROCESS_32x2x4 
0, %4, %5, (%4 + 32), (%5 + 32), %6 +%endmacro + +; void aom_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride, +; uint8_t *ref[4], int ref_stride, +; uint32_t res[4]); +; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 +%macro HIGH_SADNXN4D 2 +%if UNIX64 +cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif + +; set m1 + push srcq + mov srcd, 0x00010001 + movd m1, srcd + pshufd m1, m1, 0x0 + pop srcq + + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov ref2q, [ref1q+gprsize*1] + mov ref3q, [ref1q+gprsize*2] + mov ref4q, [ref1q+gprsize*3] + mov ref1q, [ref1q+gprsize*0] + +; convert byte pointers to short pointers + shl srcq, 1 + shl ref2q, 1 + shl ref3q, 1 + shl ref4q, 1 + shl ref1q, 1 + + HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 +%rep (%2-4)/2 + HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 +%endrep + HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 + ; N.B. HIGH_PROCESS outputs dwords (32 bits) + ; so in high bit depth even the smallest width (4) needs 128bits i.e. 
XMM + movhlps m0, m4 + movhlps m1, m5 + movhlps m2, m6 + movhlps m3, m7 + paddd m4, m0 + paddd m5, m1 + paddd m6, m2 + paddd m7, m3 + punpckldq m4, m5 + punpckldq m6, m7 + movhlps m0, m4 + movhlps m1, m6 + paddd m4, m0 + paddd m6, m1 + punpcklqdq m4, m6 + movifnidn r4, r4mp + movu [r4], m4 + RET +%endmacro + + +INIT_XMM sse2 +HIGH_SADNXN4D 64, 64 +HIGH_SADNXN4D 64, 32 +HIGH_SADNXN4D 32, 64 +HIGH_SADNXN4D 32, 32 +HIGH_SADNXN4D 32, 16 +HIGH_SADNXN4D 16, 32 +HIGH_SADNXN4D 16, 16 +HIGH_SADNXN4D 16, 8 +HIGH_SADNXN4D 8, 16 +HIGH_SADNXN4D 8, 8 +HIGH_SADNXN4D 8, 4 +HIGH_SADNXN4D 4, 8 +HIGH_SADNXN4D 4, 4 diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm new file mode 100644 index 000000000..8427b891c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm @@ -0,0 +1,366 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro HIGH_SAD_FN 4 +%if %4 == 0 +%if %3 == 5 +cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%else ; avg +%if %3 == 5 +cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \ + second_pred, n_rows +%else ; %3 == 7 +cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \ + ref, ref_stride, \ + second_pred, \ + src_stride3, ref_stride3 +%if ARCH_X86_64 +%define n_rowsd r7d +%else ; x86-32 +%define n_rowsd dword r0m +%endif ; x86-32/64 +%endif ; %3 == 5/7 +%endif ; avg/sad + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided +%if %3 == 7 + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] +%endif ; %3 == 7 +; convert src, ref & second_pred to short ptrs (from byte ptrs) + shl srcq, 1 + shl refq, 1 +%if %4 == 1 + shl second_predq, 1 +%endif +%endmacro + +; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD64XN 1-2 0 + HIGH_SAD_FN 64, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 + pxor m6, m6 + +.loop: + ; first half of each row + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+32] + psubusw m5, m3 + psubusw m3, [srcq+32] + por m3, m5 + mova m5, [srcq+48] + psubusw m5, m4 + psubusw m4, [srcq+48] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + 
paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + paddd m0, m1 + paddd m0, m3 + ; second half of each row + movu m1, [refq+64] + movu m2, [refq+80] + movu m3, [refq+96] + movu m4, [refq+112] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq+64] + psubusw m5, m1 + psubusw m1, [srcq+64] + por m1, m5 + mova m5, [srcq+80] + psubusw m5, m2 + psubusw m2, [srcq+80] + por m2, m5 + mova m5, [srcq+96] + psubusw m5, m3 + psubusw m3, [srcq+96] + por m3, m5 + mova m5, [srcq+112] + psubusw m5, m4 + psubusw m4, [srcq+112] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 +HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 +HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 +HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 + + +; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD32XN 1-2 0 + HIGH_SAD_FN 32, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+32] + psubusw m5, m3 + psubusw m3, 
[srcq+32] + por m3, m5 + mova m5, [srcq+48] + psubusw m5, m4 + psubusw m4, [srcq+48] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD32XN 64 ; highbd_sad32x64_sse2 +HIGH_SAD32XN 32 ; highbd_sad32x32_sse2 +HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 +HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 +HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 +HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 + +; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD16XN 1-2 0 + HIGH_SAD_FN 16, %1, 5, %2 + mov n_rowsd, %1/2 + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_strideq*2+16] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+16] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*2+16] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+src_strideq*2] + psubusw m5, m3 + psubusw m3, [srcq+src_strideq*2] + por m3, m5 + mova m5, [srcq+src_strideq*2+16] + psubusw m5, m4 + psubusw m4, [srcq+src_strideq*2+16] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro 
+ +INIT_XMM sse2 +HIGH_SAD16XN 32 ; highbd_sad16x32_sse2 +HIGH_SAD16XN 16 ; highbd_sad16x16_sse2 +HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 +HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 +HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 +HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 + + +; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD8XN 1-2 0 + HIGH_SAD_FN 8, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq*2] + movu m3, [refq+ref_strideq*4] + movu m4, [refq+ref_stride3q*2] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+src_strideq*2] + psubusw m5, m2 + psubusw m2, [srcq+src_strideq*2] + por m2, m5 + mova m5, [srcq+src_strideq*4] + psubusw m5, m3 + psubusw m3, [srcq+src_strideq*4] + por m3, m5 + mova m5, [srcq+src_stride3q*2] + psubusw m5, m4 + psubusw m4, [srcq+src_stride3q*2] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*8] + paddd m0, m1 + lea srcq, [srcq+src_strideq*8] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD8XN 16 ; highbd_sad8x16_sse2 +HIGH_SAD8XN 8 ; highbd_sad8x8_sse2 +HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 +HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 +HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 +HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 diff --git a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm new file mode 100644 index 
000000000..797e9c1d4 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -0,0 +1,1040 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_8: times 8 dw 8 +bilin_filter_m_sse2: times 8 dw 16 + times 8 dw 0 + times 8 dw 14 + times 8 dw 2 + times 8 dw 12 + times 8 dw 4 + times 8 dw 10 + times 8 dw 6 + times 16 dw 8 + times 8 dw 6 + times 8 dw 10 + times 8 dw 4 + times 8 dw 12 + times 8 dw 2 + times 8 dw 14 + +SECTION .text + +; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, +; int x_offset, int y_offset, +; const uint8_t *dst, ptrdiff_t dst_stride, +; int height, unsigned int *sse); +; +; This function returns the SE and stores SSE in the given pointer. + +%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse + psubw %3, %4 + psubw %1, %2 + mova %4, %3 ; make copies to manipulate to calc sum + mova %2, %1 ; use originals for calc sse + pmaddwd %3, %3 + paddw %4, %2 + pmaddwd %1, %1 + movhlps %2, %4 + paddd %6, %3 + paddw %4, %2 + pxor %2, %2 + pcmpgtw %2, %4 ; mask for 0 > %4 (sum) + punpcklwd %4, %2 ; sign-extend word to dword + paddd %6, %1 + paddd %5, %4 + +%endmacro + +%macro STORE_AND_RET 0 +%if mmsize == 16 + ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit + ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. + ; We have to sign-extend it before adding the words within the register + ; and outputting to a dword.
+ movhlps m3, m7 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + pshufd m4, m6, 0x1 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + movd [r1], m7 ; store sse + movd rax, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro INC_SRC_BY_SRC_STRIDE 0 +%if ARCH_X86=1 && CONFIG_PIC=1 + add srcq, src_stridemp + add srcq, src_stridemp +%else + lea srcq, [srcq + src_strideq*2] +%endif +%endmacro + +%macro SUBPEL_VARIANCE 1-2 0 ; W +%define bilin_filter_m bilin_filter_m_sse2 +%define filter_idx_shift 5 + + +%ifdef PIC ; 64bit PIC + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse + %define sec_str sec_strideq + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ + y_offset, dst, dst_stride, height, sse + %endif + %define block_height heightd + %define bilin_filter sseq +%else + %if ARCH_X86=1 && CONFIG_PIC=1 + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse, g_bilin_filter, g_pw_8 + %define block_height dword heightm + %define sec_str sec_stridemp + + ; Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, height, \ + sse, g_bilin_filter, g_pw_8 + %define block_height heightd + + ; Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, 
ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %endif + %else + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ + 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse + %if ARCH_X86_64 + %define block_height heightd + %define sec_str sec_strideq + %else + %define block_height dword heightm + %define sec_str sec_stridemp + %endif + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, height, sse + %define block_height heightd + %endif + + %define bilin_filter bilin_filter_m + %endif +%endif + + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + +%if %1 < 16 + sar block_height, 1 +%endif +%if %2 == 1 ; avg + shl sec_str, 1 +%endif + + ; FIXME(rbultje) replace by jumptable? + test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m2, [srcq + 16] + mova m1, [dstq] + mova m3, [dstq + 16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m2, [secq+16] +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq + src_strideq*2] + mova m1, [dstq] + mova m3, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m2, [secq] +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_zero_y_zero_loop + STORE_AND_RET + +.x_zero_y_nonzero: + cmp y_offsetd, 8 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m4, 
[srcq+src_strideq*2] + movu m5, [srcq+src_strideq*2+16] + mova m2, [dstq] + mova m3, [dstq+16] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*4] + mova m2, [dstq] + mova m3, [dstq+dst_strideq*2] + pavgw m0, m1 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_zero_y_half_loop + STORE_AND_RET + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] + mova m9, [bilin_filter+y_offsetq+16] + mova m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0, reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq + 16] + movu m4, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*2+16] + mova m2, [dstq] + mova m3, [dstq+16] + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). 
Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. + pmullw m1, filter_y_a + pmullw m5, filter_y_b + paddw m1, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m1, m5 + paddw m0, m4 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*4] + mova m4, m1 + mova m2, [dstq] + mova m3, [dstq+dst_strideq*2] + pmullw m1, filter_y_a + pmullw m5, filter_y_b + paddw m1, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m1, m5 + paddw m0, m4 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonzero: + cmp x_offsetd, 8 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq + 16] + movu m4, [srcq + 2] + movu m5, [srcq + 18] + mova m2, [dstq] + mova m3, [dstq + 16] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + 
movu m1, [srcq + src_strideq*2] + movu m4, [srcq + 2] + movu m5, [srcq + src_strideq*2 + 2] + mova m2, [dstq] + mova m3, [dstq + dst_strideq*2] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_half_y_zero_loop + STORE_AND_RET + +.x_half_y_nonzero: + cmp y_offsetd, 8 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 + pavgw m1, m3 +.x_half_y_half_loop: + movu m2, [srcq] + movu m3, [srcq + 16] + movu m4, [srcq + 2] + movu m5, [srcq + 18] + pavgw m2, m4 + pavgw m3, m5 + pavgw m0, m2 + pavgw m1, m3 + mova m4, [dstq] + mova m5, [dstq + 16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + mova m0, m2 + mova m1, m3 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 +.x_half_y_half_loop: + movu m2, [srcq] + movu m3, [srcq + src_strideq*2] + movu m4, [srcq + 2] + movu m5, [srcq + src_strideq*2 + 2] + pavgw m2, m4 + pavgw m3, m5 + pavgw m0, m2 + pavgw m2, m3 + mova m4, [dstq] + mova m5, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m2, [secq] +%endif + SUM_SSE m0, m4, m2, m5, m6, m7 + mova m0, m3 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_half_y_half_loop + STORE_AND_RET + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, [bilin_filter_m] 
+%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] + mova m9, [bilin_filter+y_offsetq+16] + mova m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86_32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0.5. We can reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 + pavgw m1, m3 +.x_half_y_other_loop: + movu m2, [srcq] + movu m3, [srcq+16] + movu m4, [srcq+2] + movu m5, [srcq+18] + pavgw m2, m4 + pavgw m3, m5 + mova m4, m2 + mova m5, m3 + pmullw m1, filter_y_a + pmullw m3, filter_y_b + paddw m1, filter_rnd + paddw m1, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + psrlw m1, 4 + paddw m0, m2 + mova m2, [dstq] + psrlw m0, 4 + mova m3, [dstq+16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + mova m0, m4 + mova m1, m5 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 +.x_half_y_other_loop: + movu m2, [srcq] + movu m3, [srcq+src_strideq*2] + movu m4, [srcq+2] + movu m5, [srcq+src_strideq*2+2] + pavgw m2, m4 + pavgw m3, m5 + mova m4, m2 + mova m5, m3 + pmullw m4, filter_y_a + pmullw m3, filter_y_b + paddw m4, filter_rnd + paddw m4, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + psrlw m4, 4 + paddw m0, m2 + mova m2, [dstq] + psrlw m0, 4 + mova m3, 
[dstq+dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m4, [secq] +%endif + SUM_SSE m0, m2, m4, m3, m6, m7 + mova m0, m5 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0. We can reuse y_offset reg. +%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + mova m4, [dstq] + mova m5, [dstq+16] + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m1, m3 + paddw m0, m2 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m2, [srcq+2] + movu m3, [srcq+src_strideq*2+2] + mova m4, [dstq] + mova m5, [dstq+dst_strideq*2] + pmullw m1, filter_x_a + 
pmullw m3, filter_x_b + paddw m1, filter_rnd + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m1, m3 + paddw m0, m2 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + + lea srcq, [srcq+src_strideq*4] + lea dstq, [dstq+dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 8 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0.5. We can reuse y_offset reg. 
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + lea srcq, [srcq+src_strideq*2] +.x_other_y_half_loop: + movu m2, [srcq] + movu m3, [srcq+16] + movu m4, [srcq+2] + movu m5, [srcq+18] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + mova m4, [dstq] + mova m5, [dstq+16] + psrlw m2, 4 + psrlw m3, 4 + pavgw m0, m2 + pavgw m1, m3 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + mova m0, m2 + mova m1, m3 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m0, m2 + psrlw m0, 4 + lea srcq, [srcq+src_strideq*2] +.x_other_y_half_loop: + movu m2, [srcq] + movu m3, [srcq+src_strideq*2] + movu m4, [srcq+2] + movu m5, [srcq+src_strideq*2+2] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + mova m4, [dstq] + mova m5, [dstq+dst_strideq*2] + psrlw m2, 4 + psrlw m3, 4 + pavgw m0, m2 + pavgw m2, m3 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m2, [secq] +%endif + SUM_SSE m0, m4, m2, m5, m6, m7 + mova m0, m3 + + 
lea srcq, [srcq+src_strideq*4] + lea dstq, [dstq+dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonhalf: +; loading filter - this is same as in 8-bit depth +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5 + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [bilin_filter+y_offsetq] + mova m11, [bilin_filter+y_offsetq+16] + mova m12, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; In this case, there is NO unused register. Used src_stride register. Later, +; src_stride has to be loaded from stack when it is needed. +%define tempq src_strideq + mov tempq, g_bilin_filterm + add x_offsetq, tempq + add y_offsetq, tempq +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif +; end of load filter + + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m2, [srcq+2] + movu m1, [srcq+16] + movu m3, [srcq+18] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movu m2, [srcq] + movu m4, [srcq+2] + movu m3, 
[srcq+16] + movu m5, [srcq+18] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + psrlw m2, 4 + psrlw m3, 4 + mova m4, m2 + mova m5, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + pmullw m1, filter_y_a + pmullw m3, filter_y_b + paddw m0, m2 + paddw m1, filter_rnd + mova m2, [dstq] + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + mova m3, [dstq+16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + mova m0, m4 + mova m1, m5 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq + dst_strideq * 2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m0, m2 + psrlw m0, 4 + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movu m2, [srcq] + movu m4, [srcq+2] + INC_SRC_BY_SRC_STRIDE + movu m3, [srcq] + movu m5, [srcq+2] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + psrlw m2, 4 + psrlw m3, 4 + mova m4, m2 + mova m5, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + pmullw m4, filter_y_a + pmullw m3, filter_y_b + paddw m0, m2 + paddw m4, filter_rnd + mova m2, [dstq] + paddw m4, m3 + psrlw m0, 4 + psrlw m4, 4 + mova m3, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m4, [secq] +%endif + SUM_SSE m0, m2, m4, m3, m6, m7 + mova m0, m5 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq + dst_strideq * 4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET +%endmacro + +INIT_XMM sse2 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + 
+INIT_XMM sse2 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c new file mode 100644 index 000000000..7bc8a0df3 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" + +typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, + ptrdiff_t pred_stride); + +static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + __m128i x0, x1, x2, x3; + int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); + + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = 
_mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + + _mm_storel_epi64((__m128i *)store_diff, x0); + store_diff = (int64_t *)(diff + 1 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x1); + store_diff = (int64_t *)(diff + 2 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x2); + store_diff = (int64_t *)(diff + 3 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x3); +} + +static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); + + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); + u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); + u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); + u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride)); + v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride)); + v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride)); + v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = _mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + x4 = _mm_sub_epi16(u4, v4); + x5 = 
_mm_sub_epi16(u5, v5); + x6 = _mm_sub_epi16(u6, v6); + x7 = _mm_sub_epi16(u7, v7); + + _mm_storel_epi64((__m128i *)store_diff, x0); + store_diff = (int64_t *)(diff + 1 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x1); + store_diff = (int64_t *)(diff + 2 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x2); + store_diff = (int64_t *)(diff + 3 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x3); + store_diff = (int64_t *)(diff + 4 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x4); + store_diff = (int64_t *)(diff + 5 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x5); + store_diff = (int64_t *)(diff + 6 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x6); + store_diff = (int64_t *)(diff + 7 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x7); +} + +static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + __m128i x0, x1, x2, x3; + + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = _mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + + _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); + _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); + _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); + _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); +} + +static void 
subtract_8x8(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); + u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); + u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); + u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride)); + v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride)); + v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride)); + v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = _mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + x4 = _mm_sub_epi16(u4, v4); + x5 = _mm_sub_epi16(u5, v5); + x6 = _mm_sub_epi16(u6, v6); + x7 = _mm_sub_epi16(u7, v7); + + _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); + _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); + _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); + _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); + _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4); + _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5); + _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), 
x6); + _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7); +} + +static void subtract_8x16(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); + diff += diff_stride << 3; + src += src_stride << 3; + pred += pred_stride << 3; + subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); +} + +static void subtract_16x8(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); + diff += 8; + src += 8; + pred += 8; + subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); +} + +static void subtract_16x16(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride); + diff += diff_stride << 3; + src += src_stride << 3; + pred += pred_stride << 3; + subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride); +} + +static void subtract_16x32(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); + diff += diff_stride << 4; + src += src_stride << 4; + pred += pred_stride << 4; + subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); +} + +static void subtract_32x16(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); + diff += 16; + src += 16; + pred += 16; + subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); +} + +static void subtract_32x32(int16_t *diff, ptrdiff_t 
diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 4;
+  src += src_stride << 4;
+  pred += pred_stride << 4;
+  subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+// 32 wide, 64 tall: two 32x32 tiles stacked vertically (pointers advance by
+// 32 rows between the calls).
+static void subtract_32x64(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 5;
+  src += src_stride << 5;
+  pred += pred_stride << 5;
+  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+// 64 wide, 32 tall: two 32x32 tiles side by side (pointers advance by 32
+// columns between the calls).
+static void subtract_64x32(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += 32;
+  src += 32;
+  pred += 32;
+  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+// 64x64: two 64x32 tiles stacked vertically.
+static void subtract_64x64(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 5;
+  src += src_stride << 5;
+  pred += pred_stride << 5;
+  subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+// 64 wide, 128 tall: two 64x64 tiles stacked vertically.
+static void subtract_64x128(int16_t *diff, ptrdiff_t diff_stride,
+                            const uint16_t *src, ptrdiff_t src_stride,
+                            const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 6;
+  src += src_stride << 6;
+  pred += pred_stride << 6;
+  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+// 128 wide, 64 tall: two 64x64 tiles side by side.
+static void subtract_128x64(int16_t *diff, ptrdiff_t diff_stride,
+                            const uint16_t *src, ptrdiff_t src_stride,
+                            const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += 64;
+  src += 64;
+  pred += 64;
+  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+// 128x128: two 128x64 tiles stacked vertically.
+static void subtract_128x128(int16_t *diff, ptrdiff_t diff_stride,
+                             const uint16_t *src, ptrdiff_t src_stride,
+                             const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 6;
+  src += src_stride << 6;
+  pred += pred_stride << 6;
+  subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+// Maps a block size to its subtract kernel. Note the naming: the kernels are
+// named subtract_<cols>x<rows> (width x height), while this function takes
+// rows first. Only the (rows, cols) pairs listed below are handled.
+// NOTE(review): for any other size this asserts; when assertions are compiled
+// out it returns NULL, which the caller below would then call - confirm that
+// callers never pass an unlisted size.
+static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
+  SubtractWxHFuncType ret_func_ptr = NULL;
+  if (rows == 4) {
+    if (cols == 4) {
+      ret_func_ptr = subtract_4x4;
+    } else if (cols == 8) {
+      ret_func_ptr = subtract_8x4;
+    }
+  } else if (rows == 8) {
+    if (cols == 4) {
+      ret_func_ptr = subtract_4x8;
+    } else if (cols == 8) {
+      ret_func_ptr = subtract_8x8;
+    } else if (cols == 16) {
+      ret_func_ptr = subtract_16x8;
+    }
+  } else if (rows == 16) {
+    if (cols == 8) {
+      ret_func_ptr = subtract_8x16;
+    } else if (cols == 16) {
+      ret_func_ptr = subtract_16x16;
+    } else if (cols == 32) {
+      ret_func_ptr = subtract_32x16;
+    }
+  } else if (rows == 32) {
+    if (cols == 16) {
+      ret_func_ptr = subtract_16x32;
+    } else if (cols == 32) {
+      ret_func_ptr = subtract_32x32;
+    } else if (cols == 64) {
+      ret_func_ptr = subtract_64x32;
+    }
+  } else if (rows == 64) {
+    if (cols == 32) {
+      ret_func_ptr = subtract_32x64;
+    } else if (cols == 64) {
+      ret_func_ptr = subtract_64x64;
+    } else if (cols == 128) {
+      ret_func_ptr = subtract_128x64;
+    }
+  } else if (rows == 128) {
+    if (cols == 64) {
+      ret_func_ptr = subtract_64x128;
+    } else if (cols == 128) {
+      ret_func_ptr = subtract_128x128;
+    }
+  }
+  if (!ret_func_ptr) {
+    assert(0);
+  }
+  return ret_func_ptr;
+}
+
+// Public entry point: converts the byte-typed src8/pred8 handles to uint16_t
+// pointers with CONVERT_TO_SHORTPTR, looks up the kernel for (rows, cols) and
+// runs it, writing the residual into diff. The bit depth argument is unused.
+void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff,
+                                    ptrdiff_t diff_stride, const uint8_t *src8,
+                                    ptrdiff_t src_stride, const uint8_t *pred8,
+                                    ptrdiff_t pred_stride, int bd) {
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  SubtractWxHFuncType func;
+  (void)bd;
+
+  func = getSubtractFunc(rows, cols);
+  func(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
new file mode 100644
index 000000000..cf8ea498c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
@@ -0,0 +1,316 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+; Computes, over a 16x16 block of 16-bit samples, the sum of differences
+; (src - ref) and the sum of squared differences, and stores them through
+; the Sum and SSE pointers. Strides are given in samples and are doubled
+; below to obtain byte strides.
+; NOTE(review): the prototype below says "unsigned char *", but the code
+; treats both buffers as 16-bit words - confirm against the C declaration.
+; NOTE(review): no return value is set explicitly; rax holds the Sum pointer
+; when the function returns.
+;unsigned int aom_highbd_calc16x16var_sse2
+;(
+;    unsigned char   *  src_ptr,
+;    int             source_stride,
+;    unsigned char   *  ref_ptr,
+;    int             recon_stride,
+;    unsigned int    *  SSE,
+;    int             *  Sum
+;)
+global sym(aom_highbd_calc16x16var_sse2) PRIVATE
+sym(aom_highbd_calc16x16var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov         rsi,            arg(0) ;[src_ptr]
+        mov         rdi,            arg(2) ;[ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+        add         rax,            rax ; source stride in bytes
+        add         rdx,            rdx ; recon stride in bytes
+
+        ; Prefetch data
+        ; (first four rows of src, both 16-byte halves of each row)
+        prefetcht0  [rsi]
+        prefetcht0  [rsi+16]
+        prefetcht0  [rsi+rax]
+        prefetcht0  [rsi+rax+16]
+        lea         rbx,            [rsi+rax*2]
+        prefetcht0  [rbx]
+        prefetcht0  [rbx+16]
+        prefetcht0  [rbx+rax]
+        prefetcht0  [rbx+rax+16]
+
+        ; (same for the first four rows of ref)
+        prefetcht0  [rdi]
+        prefetcht0  [rdi+16]
+        prefetcht0  [rdi+rdx]
+        prefetcht0  [rdi+rdx+16]
+        lea         rbx,            [rdi+rdx*2]
+        prefetcht0  [rbx]
+        prefetcht0  [rbx+16]
+        prefetcht0  [rbx+rdx]
+        prefetcht0  [rbx+rdx+16]
+
+        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
+        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs
+
+        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
+        mov         rcx,            16       ; rows remaining
+
+; Each iteration processes two rows (four 8-sample vectors).
+.var16loop:
+        movdqu      xmm1,           XMMWORD PTR [rsi]
+        movdqu      xmm2,           XMMWORD PTR [rdi]
+
+        ; prefetch the two rows after the pair being processed
+        lea         rbx,            [rsi+rax*2]
+        prefetcht0  [rbx]
+        prefetcht0  [rbx+16]
+        prefetcht0  [rbx+rax]
+        prefetcht0  [rbx+rax+16]
+        lea         rbx,            [rdi+rdx*2]
+        prefetcht0  [rbx]
+        prefetcht0  [rbx+16]
+        prefetcht0  [rbx+rdx]
+        prefetcht0  [rbx+rdx+16]
+
+        ; xmm5 collects this iteration's differences as 16-bit words.
+        ; NOTE(review): summing four differences per word lane assumes the
+        ; samples are narrow enough not to overflow int16 - confirm bit depth.
+        pxor        xmm5,           xmm5
+
+        ; row 0, samples 0-7: diff -> xmm5, diff^2 (pairwise) -> xmm6
+        psubw       xmm1,           xmm2
+        movdqu      xmm3,           XMMWORD PTR [rsi+16]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+16]
+        paddd       xmm6,           xmm1
+
+        ; row 0, samples 8-15
+        psubw       xmm3,           xmm2
+        movdqu      xmm1,           XMMWORD PTR [rsi+rax]
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
+        paddd       xmm6,           xmm3
+
+        ; row 1, samples 0-7
+        psubw       xmm1,           xmm2
+        movdqu      xmm3,           XMMWORD PTR [rsi+rax+16]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx+16]
+        paddd       xmm6,           xmm1
+
+        ; row 1, samples 8-15
+        psubw       xmm3,           xmm2
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        paddd       xmm6,           xmm3
+
+        ; Sign-extend the word sums in xmm5 to dwords and add them to xmm7.
+        ; xmm1 = (x > 0) | (x == 0), then compared with zero again, which
+        ; leaves 0xFFFF exactly in the lanes where x < 0 (the sign mask).
+        movdqa      xmm1,           xmm5
+        movdqa      xmm2,           xmm5
+        pcmpgtw     xmm1,           xmm0
+        pcmpeqw     xmm2,           xmm0
+        por         xmm1,           xmm2
+        pcmpeqw     xmm1,           xmm0
+        movdqa      xmm2,           xmm5
+        punpcklwd   xmm5,           xmm1
+        punpckhwd   xmm2,           xmm1
+        paddd       xmm7,           xmm5
+        paddd       xmm7,           xmm2
+
+        ; advance both pointers by two rows
+        lea         rsi,            [rsi + 2*rax]
+        lea         rdi,            [rdi + 2*rdx]
+        sub         rcx,            2
+        jnz         .var16loop
+
+        ; Horizontal reduction: fold the four dword lanes of xmm6 (sse) and
+        ; xmm7 (sum) into lane 0.
+        movdqa      xmm4,           xmm6
+        punpckldq   xmm6,           xmm0
+
+        punpckhdq   xmm4,           xmm0
+        movdqa      xmm5,           xmm7
+
+        paddd       xmm6,           xmm4
+        punpckldq   xmm7,           xmm0
+
+        punpckhdq   xmm5,           xmm0
+        paddd       xmm7,           xmm5
+
+        movdqa      xmm4,           xmm6
+        movdqa      xmm5,           xmm7
+
+        psrldq      xmm4,           8
+        psrldq      xmm5,           8
+
+        paddd       xmm6,           xmm4
+        paddd       xmm7,           xmm5
+
+        mov         rdi,            arg(4)   ; [SSE]
+        mov         rax,            arg(5)   ; [Sum]
+
+        movd DWORD PTR [rdi],       xmm6     ; store low dword: total sse
+        movd DWORD PTR [rax],       xmm7     ; store low dword: total sum
+
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    pop rbx
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+; Same computation as aom_highbd_calc16x16var_sse2 for an 8x8 block: one
+; 8-sample vector per row, four rows per loop iteration.
+; NOTE(review): as above, the "unsigned char *" in this prototype does not
+; match the 16-bit word accesses in the code.
+;unsigned int aom_highbd_calc8x8var_sse2
+;(
+;    unsigned char   *  src_ptr,
+;    int             source_stride,
+;    unsigned char   *  ref_ptr,
+;    int             recon_stride,
+;    unsigned int    *  SSE,
+;    int             *  Sum
+;)
+global sym(aom_highbd_calc8x8var_sse2) PRIVATE
+sym(aom_highbd_calc8x8var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov         rsi,            arg(0) ;[src_ptr]
+        mov         rdi,            arg(2) ;[ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+        add         rax,            rax ; source stride in bytes
+        add         rdx,            rdx ; recon stride in bytes
+
+        ; Prefetch data
+        ; (first four rows of src, then of ref)
+        prefetcht0  [rsi]
+        prefetcht0  [rsi+rax]
+        lea         rbx,            [rsi+rax*2]
+        prefetcht0  [rbx]
+        prefetcht0  [rbx+rax]
+
+        prefetcht0  [rdi]
+        prefetcht0  [rdi+rdx]
+        lea         rbx,            [rdi+rdx*2]
+        prefetcht0  [rbx]
+        prefetcht0  [rbx+rdx]
+
+        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
+        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs
+
+        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
+        mov         rcx,            8        ; rows remaining
+
+; Each iteration processes four rows (four 8-sample vectors).
+.var8loop:
+        movdqu      xmm1,           XMMWORD PTR [rsi]
+        movdqu      xmm2,           XMMWORD PTR [rdi]
+
+        ; prefetch the four rows after the group being processed
+        lea         rbx,            [rsi+rax*4]
+        prefetcht0  [rbx]
+        prefetcht0  [rbx+rax]
+        lea         rbx,            [rbx+rax*2]
+        prefetcht0  [rbx]
+        prefetcht0  [rbx+rax]
+        lea         rbx,            [rdi+rdx*4]
+        prefetcht0  [rbx]
+        prefetcht0  [rbx+rdx]
+        lea         rbx,            [rbx+rdx*2]
+        prefetcht0  [rbx]
+        prefetcht0  [rbx+rdx]
+
+        ; xmm5 collects this iteration's differences as 16-bit words
+        pxor        xmm5,           xmm5
+
+        ; row 0
+        psubw       xmm1,           xmm2
+        movdqu      xmm3,           XMMWORD PTR [rsi+rax]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
+        paddd       xmm6,           xmm1
+
+        ; step to rows 2/3 now; rows 1's data is already loaded in xmm3/xmm2
+        lea         rsi,            [rsi + 2*rax]
+        lea         rdi,            [rdi + 2*rdx]
+
+        ; row 1
+        psubw       xmm3,           xmm2
+        movdqu      xmm1,           XMMWORD PTR [rsi]
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        movdqu      xmm2,           XMMWORD PTR [rdi]
+        paddd       xmm6,           xmm3
+
+        ; row 2
+        psubw       xmm1,           xmm2
+        movdqu      xmm3,           XMMWORD PTR [rsi+rax]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
+        paddd       xmm6,           xmm1
+
+        ; row 3
+        psubw       xmm3,           xmm2
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        paddd       xmm6,           xmm3
+
+        ; sign-extend the word sums in xmm5 to dwords (xmm1 becomes the
+        ; sign mask: 0xFFFF where the word is negative) and add into xmm7
+        movdqa      xmm1,           xmm5
+        movdqa      xmm2,           xmm5
+        pcmpgtw     xmm1,           xmm0
+        pcmpeqw     xmm2,           xmm0
+        por         xmm1,           xmm2
+        pcmpeqw     xmm1,           xmm0
+        movdqa      xmm2,           xmm5
+        punpcklwd   xmm5,           xmm1
+        punpckhwd   xmm2,           xmm1
+        paddd       xmm7,           xmm5
+        paddd       xmm7,           xmm2
+
+        ; advance past rows 2/3 to the next group of four
+        lea         rsi,            [rsi + 2*rax]
+        lea         rdi,            [rdi + 2*rdx]
+        sub         rcx,            4
+        jnz         .var8loop
+
+        ; horizontal reduction of xmm6 (sse) and xmm7 (sum) into lane 0
+        movdqa      xmm4,           xmm6
+        punpckldq   xmm6,           xmm0
+
+        punpckhdq   xmm4,           xmm0
+        movdqa      xmm5,           xmm7
+
+        paddd       xmm6,           xmm4
+        punpckldq   xmm7,           xmm0
+
+        punpckhdq   xmm5,           xmm0
+        paddd       xmm7,           xmm5
+
+        movdqa      xmm4,           xmm6
+        movdqa      xmm5,           xmm7
+
+        psrldq      xmm4,           8
+        psrldq      xmm5,           8
+
+        paddd       xmm6,           xmm4
+        paddd       xmm7,           xmm5
+
+        mov         rdi,            arg(4)   ; [SSE]
+        mov         rax,            arg(5)   ; [Sum]
+
+        movd DWORD PTR [rdi],       xmm6     ; store low dword: total sse
+        movd DWORD PTR [rax],       xmm7     ; store low dword: total sum
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    pop rbx
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
new file mode 100644
index 000000000..29f96ce24
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
@@ -0,0 +1,695 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+
+// Signature shared by the fixed-size kernels below: writes the sum of
+// squared differences to *sse and the sum of differences to *sum.
+typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+                                       const uint16_t *ref, int ref_stride,
+                                       uint32_t *sse, int *sum);
+
+uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    uint32_t *sse, int *sum);
+
+uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+                                      const uint16_t *ref, int ref_stride,
+                                      uint32_t *sse, int *sum);
+
+// Tiles a w x h area with block_size x block_size calls to var_fn and adds
+// the per-tile results straight into *sse / *sum (no rescaling).
+static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
+                                   const uint16_t *ref, int ref_stride, int w,
+                                   int h, uint32_t *sse, int *sum,
+                                   high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+             ref_stride, &sse0, &sum0);
+      *sse += sse0;
+      *sum += sum0;
+    }
+  }
+}
+
+// Same tiling as highbd_8_variance_sse2, but the squared error is gathered
+// in 64 bits and the totals are rounded down by 2 bits (sum) and 4 bits
+// (sse) before being stored.
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride, int w,
+                                    int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+  uint64_t sse_long = 0;
+  int32_t sum_long = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+             ref_stride, &sse0, &sum0);
+      sse_long += sse0;
+      sum_long += sum0;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+// Same as highbd_10_variance_sse2 with rounding shifts of 4 bits (sum) and
+// 8 bits (sse).
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride, int w,
+                                    int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+  uint64_t sse_long = 0;
+  int32_t sum_long = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+             ref_stride, &sse0, &sum0);
+      sse_long += sse0;
+      sum_long += sum0;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+}
+
+// HIGH_GET_VAR(S) defines aom_highbd_get / aom_highbd_10_get /
+// aom_highbd_12_get SxSvar_sse2: thin wrappers that convert the byte-typed
+// handles with CONVERT_TO_SHORTPTR and call the SxS kernel. The 10 and 12
+// variants round the kernel's outputs by (2, 4) and (4, 8) bits.
+#define HIGH_GET_VAR(S) \
+  void aom_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                         const uint8_t *ref8, int ref_stride, \
+                                         uint32_t *sse, int *sum) { \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+    aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+                                       sum); \
+  } \
+ \
+  void aom_highbd_10_get##S##x##S##var_sse2( \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+      int ref_stride, uint32_t *sse, int *sum) { \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+    aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+                                       sum); \
+    *sum = ROUND_POWER_OF_TWO(*sum, 2); \
+    *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+  } \
+ \
+  void aom_highbd_12_get##S##x##S##var_sse2( \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+      int ref_stride, uint32_t *sse, int *sum) { \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+    aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+                                       sum); \
+    *sum = ROUND_POWER_OF_TWO(*sum, 4); \
+    *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+  }
+
+HIGH_GET_VAR(16);
+HIGH_GET_VAR(8);
+
+#undef HIGH_GET_VAR
+
+// VAR_FN(w, h, block_size, shift) defines the 8/10/12 variance functions for
+// a w x h block. Each returns sse - sum^2 / 2^shift, where shift is
+// log2(w * h); the 10 and 12 variants clamp a negative result to 0.
+#define VAR_FN(w, h, block_size, shift) \
+  uint32_t aom_highbd_8_variance##w##x##h##_sse2( \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+      int ref_stride, uint32_t *sse) { \
+    int sum; \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+    highbd_8_variance_sse2( \
+        src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+        aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+  } \
+ \
+  uint32_t aom_highbd_10_variance##w##x##h##_sse2( \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+      int ref_stride, uint32_t *sse) { \
+    int sum; \
+    int64_t var; \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+    highbd_10_variance_sse2( \
+        src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+        aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+    return (var >= 0) ? (uint32_t)var : 0; \
+  } \
+ \
+  uint32_t aom_highbd_12_variance##w##x##h##_sse2( \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+      int ref_stride, uint32_t *sse) { \
+    int sum; \
+    int64_t var; \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+    highbd_12_variance_sse2( \
+        src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+        aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+    return (var >= 0) ? (uint32_t)var : 0; \
+  }
+
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
+
+#undef VAR_FN
+
+// The mse functions below reuse the variance helpers and return only the
+// (bit-depth rounded) sum of squared differences; the sum is discarded.
+unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                        const uint8_t *ref8, int ref_stride,
+                                        unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+                         aom_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+                          aom_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+                          aom_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                      const uint8_t *ref8, int ref_stride,
+                                      unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+                         aom_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+unsigned int aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+                          aom_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+                          aom_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+// The 2 unused parameters are place holders for PIC enabled build.
+// These definitions are for functions defined in
+// highbd_subpel_variance_impl_sse2.asm
+#define DECL(w, opt) \
+  int aom_highbd_sub_pixel_variance##w##xh_##opt( \
+      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+      const uint16_t *dst, ptrdiff_t dst_stride, int height, \
+      unsigned int *sse, void *unused0, void *unused);
+#define DECLS(opt) \
+  DECL(8, opt); \
+  DECL(16, opt)
+
+DECLS(sse2);
+
+#undef DECLS
+#undef DECL
+
+// FN(w, h, wf, wlog2, hlog2, opt, cast) defines the 8/10/12 sub-pixel
+// variance functions for a w x h block built from wf-wide column kernels.
+// When w > wf the extra columns are taken at sample offsets 16, 32 and 48,
+// so that path is only meaningful for wf == 16 (true for every instantiation
+// below). wlog2 and hlog2 are only ever used as the sum wlog2 + hlog2.
+// The 12 variant additionally walks the block in strips of at most 16 rows
+// and gathers the squared error in 64 bits before rounding.
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+  uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt( \
+      const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+    uint32_t sse; \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+    int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+        src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
+        NULL); \
+    if (w > wf) { \
+      unsigned int sse2; \
+      int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
+          &sse2, NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+      if (w > wf * 2) { \
+        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
+            &sse2, NULL, NULL); \
+        se += se2; \
+        sse += sse2; \
+        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
+            &sse2, NULL, NULL); \
+        se += se2; \
+        sse += sse2; \
+      } \
+    } \
+    *sse_ptr = sse; \
+    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \
+  } \
+ \
+  uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+      const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+    int64_t var; \
+    uint32_t sse; \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+    int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+        src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
+        NULL); \
+    if (w > wf) { \
+      uint32_t sse2; \
+      int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
+          &sse2, NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+      if (w > wf * 2) { \
+        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
+            &sse2, NULL, NULL); \
+        se += se2; \
+        sse += sse2; \
+        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
+            &sse2, NULL, NULL); \
+        se += se2; \
+        sse += sse2; \
+      } \
+    } \
+    se = ROUND_POWER_OF_TWO(se, 2); \
+    sse = ROUND_POWER_OF_TWO(sse, 4); \
+    *sse_ptr = sse; \
+    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+    return (var >= 0) ? (uint32_t)var : 0; \
+  } \
+ \
+  uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+      const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+    int start_row; \
+    uint32_t sse; \
+    int se = 0; \
+    int64_t var; \
+    uint64_t long_sse = 0; \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+    for (start_row = 0; start_row < h; start_row += 16) { \
+      uint32_t sse2; \
+      int height = h - start_row < 16 ? h - start_row : 16; \
+      int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+          src + (start_row * src_stride), src_stride, x_offset, y_offset, \
+          dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \
+          NULL); \
+      se += se2; \
+      long_sse += sse2; \
+      if (w > wf) { \
+        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+            src + 16 + (start_row * src_stride), src_stride, x_offset, \
+            y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \
+            &sse2, NULL, NULL); \
+        se += se2; \
+        long_sse += sse2; \
+        if (w > wf * 2) { \
+          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+              src + 32 + (start_row * src_stride), src_stride, x_offset, \
+              y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \
+              height, &sse2, NULL, NULL); \
+          se += se2; \
+          long_sse += sse2; \
+          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+              src + 48 + (start_row * src_stride), src_stride, x_offset, \
+              y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \
+              height, &sse2, NULL, NULL); \
+          se += se2; \
+          long_sse += sse2; \
+        } \
+      } \
+    } \
+    se = ROUND_POWER_OF_TWO(se, 4); \
+    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+    *sse_ptr = sse; \
+    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+    return (var >= 0) ? (uint32_t)var : 0; \
+  }
+
+#define FNS(opt) \
+  FN(64, 64, 16, 6, 6, opt, (int64_t)); \
+  FN(64, 32, 16, 6, 5, opt, (int64_t)); \
+  FN(32, 64, 16, 5, 6, opt, (int64_t)); \
+  FN(32, 32, 16, 5, 5, opt, (int64_t)); \
+  FN(32, 16, 16, 5, 4, opt, (int64_t)); \
+  FN(16, 32, 16, 4, 5, opt, (int64_t)); \
+  FN(16, 16, 16, 4, 4, opt, (int64_t)); \
+  FN(16, 8, 16, 4, 3, opt, (int64_t)); \
+  FN(8, 16, 8, 3, 4, opt, (int64_t)); \
+  FN(8, 8, 8, 3, 3, opt, (int64_t)); \
+  FN(8, 4, 8, 3, 2, opt, (int64_t));
+
+FNS(sse2);
+
+#undef FNS
+#undef FN
+
+// The 2 unused parameters are place holders for PIC enabled build.
+#define DECL(w, opt) \
+  int aom_highbd_sub_pixel_avg_variance##w##xh_##opt( \
+      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+      const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \
+      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
+      void *unused);
+#define DECLS(opt1) \
+  DECL(16, opt1) \
+  DECL(8, opt1)
+
+DECLS(sse2);
+#undef DECL
+#undef DECLS
+
+// Averaging counterpart of the sub-pixel variance macro above: identical
+// structure, with a second predictor `sec` passed to the kernels. `sec` is
+// addressed with a stride of w (it is passed as sec_stride and used for the
+// per-strip row offset in the 12 variant).
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+  uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
+      const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+      const uint8_t *sec8) { \
+    uint32_t sse; \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+    int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
+        NULL, NULL); \
+    if (w > wf) { \
+      uint32_t sse2; \
+      int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \
+          sec + 16, w, h, &sse2, NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+      if (w > wf * 2) { \
+        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
+            sec + 32, w, h, &sse2, NULL, NULL); \
+        se += se2; \
+        sse += sse2; \
+        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
+            sec + 48, w, h, &sse2, NULL, NULL); \
+        se += se2; \
+        sse += sse2; \
+      } \
+    } \
+    *sse_ptr = sse; \
+    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \
+  } \
+ \
+  uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+      const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+      const uint8_t *sec8) { \
+    int64_t var; \
+    uint32_t sse; \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+    int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
+        NULL, NULL); \
+    if (w > wf) { \
+      uint32_t sse2; \
+      int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \
+          sec + 16, w, h, &sse2, NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+      if (w > wf * 2) { \
+        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
+            sec + 32, w, h, &sse2, NULL, NULL); \
+        se += se2; \
+        sse += sse2; \
+        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
+            sec + 48, w, h, &sse2, NULL, NULL); \
+        se += se2; \
+        sse += sse2; \
+      } \
+    } \
+    se = ROUND_POWER_OF_TWO(se, 2); \
+    sse = ROUND_POWER_OF_TWO(sse, 4); \
+    *sse_ptr = sse; \
+    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+    return (var >= 0) ? (uint32_t)var : 0; \
+  } \
+ \
+  uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+      const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+      const uint8_t *sec8) { \
+    int start_row; \
+    int64_t var; \
+    uint32_t sse; \
+    int se = 0; \
+    uint64_t long_sse = 0; \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+    for (start_row = 0; start_row < h; start_row += 16) { \
+      uint32_t sse2; \
+      int height = h - start_row < 16 ? h - start_row : 16; \
+      int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+          src + (start_row * src_stride), src_stride, x_offset, y_offset, \
+          dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \
+          w, height, &sse2, NULL, NULL); \
+      se += se2; \
+      long_sse += sse2; \
+      if (w > wf) { \
+        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+            src + 16 + (start_row * src_stride), src_stride, x_offset, \
+            y_offset, dst + 16 + (start_row * dst_stride), dst_stride, \
+            sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \
+        se += se2; \
+        long_sse += sse2; \
+        if (w > wf * 2) { \
+          se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+              src + 32 + (start_row * src_stride), src_stride, x_offset, \
+              y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \
+              sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \
+          se += se2; \
+          long_sse += sse2; \
+          se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+              src + 48 + (start_row * src_stride), src_stride, x_offset, \
+              y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \
+              sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \
+          se += se2; \
+          long_sse += sse2; \
+        } \
+      } \
+    } \
+    se = ROUND_POWER_OF_TWO(se, 4); \
+    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+    *sse_ptr = sse; \
+    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+    return (var >= 0) ? (uint32_t)var : 0; \
+  }
+
+// The 8x16 entry passes (wlog2, hlog2) = (3, 4) to agree with the
+// non-averaging table above; only the sum of the two is used.
+#define FNS(opt1) \
+  FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+  FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+  FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+  FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+  FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+  FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+  FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+  FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+  FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
+  FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+  FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+FNS(sse2);
+
+#undef FNS
+#undef FN
+
+// Gathers every 8th sample of `ref` into a dense width x height block in
+// comp_pred (output rows are packed back to back). Consecutive input rows
+// are ref_stride * 8 samples apart.
+// Each sample is loaded on its own as a uint16_t. Only the low 16 bits of
+// every loaded value survive the unpack sequence, so this yields the same
+// output as a 32-bit load while avoiding a type-punned, possibly misaligned
+// read that also touches the sample after the one needed.
+void aom_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height,
+                                    const uint8_t *ref8, int ref_stride) {
+  int i, j;
+  int stride = ref_stride << 3;
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+  if (width >= 8) {
+    // read 8 points at one time
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j += 8) {
+        __m128i s0 = _mm_cvtsi32_si128(ref[0]);
+        __m128i s1 = _mm_cvtsi32_si128(ref[8]);
+        __m128i s2 = _mm_cvtsi32_si128(ref[16]);
+        __m128i s3 = _mm_cvtsi32_si128(ref[24]);
+        __m128i s4 = _mm_cvtsi32_si128(ref[32]);
+        __m128i s5 = _mm_cvtsi32_si128(ref[40]);
+        __m128i s6 = _mm_cvtsi32_si128(ref[48]);
+        __m128i s7 = _mm_cvtsi32_si128(ref[56]);
+        __m128i t0, t1, t2, t3;
+
+        // Interleave so the eight low words end up contiguous in t0.
+        t0 = _mm_unpacklo_epi16(s0, s1);
+        t1 = _mm_unpacklo_epi16(s2, s3);
+        t2 = _mm_unpacklo_epi16(s4, s5);
+        t3 = _mm_unpacklo_epi16(s6, s7);
+        t0 = _mm_unpacklo_epi32(t0, t1);
+        t2 = _mm_unpacklo_epi32(t2, t3);
+        t0 = _mm_unpacklo_epi64(t0, t2);
+
+        _mm_storeu_si128((__m128i *)(comp_pred), t0);
+        comp_pred += 8;
+        ref += 64;  // 8 * 8;
+      }
+      // Rewind the columns consumed and step to the next input row.
+      ref += stride - (width << 3);
+    }
+  } else {
+    // read 4 points at one time
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j += 4) {
+        __m128i s0 = _mm_cvtsi32_si128(ref[0]);
+        __m128i s1 = _mm_cvtsi32_si128(ref[8]);
+        __m128i s2 = _mm_cvtsi32_si128(ref[16]);
+        __m128i s3 = _mm_cvtsi32_si128(ref[24]);
+        __m128i t0, t1;
+
+        t0 = _mm_unpacklo_epi16(s0, s1);
+        t1 = _mm_unpacklo_epi16(s2, s3);
+        t0 = _mm_unpacklo_epi32(t0, t1);
+
+        _mm_storel_epi64((__m128i *)(comp_pred), t0);
+        comp_pred += 4;
+        ref += 4 * 8;
+      }
+      ref += stride - (width << 3);
+    }
+  }
+}
+
+// Same gather as aom_highbd_upsampled_pred_sse2, followed by a rounded
+// average with `pred`: comp_pred = (gathered + pred + 1) >> 1, computed with
+// saturating unsigned 16-bit adds. `pred` is read densely (width samples per
+// row). Samples are loaded individually for the reason given above.
+void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred,
+                                             const uint8_t *pred8, int width,
+                                             int height, const uint8_t *ref8,
+                                             int ref_stride) {
+  const __m128i one = _mm_set1_epi16(1);
+  int i, j;
+  int stride = ref_stride << 3;
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+  if (width >= 8) {
+    // read 8 points at one time
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j += 8) {
+        __m128i s0 = _mm_cvtsi32_si128(ref[0]);
+        __m128i s1 = _mm_cvtsi32_si128(ref[8]);
+        __m128i s2 = _mm_cvtsi32_si128(ref[16]);
+        __m128i s3 = _mm_cvtsi32_si128(ref[24]);
+        __m128i s4 = _mm_cvtsi32_si128(ref[32]);
+        __m128i s5 = _mm_cvtsi32_si128(ref[40]);
+        __m128i s6 = _mm_cvtsi32_si128(ref[48]);
+        __m128i s7 = _mm_cvtsi32_si128(ref[56]);
+        __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
+        __m128i t0, t1, t2, t3;
+
+        t0 = _mm_unpacklo_epi16(s0, s1);
+        t1 = _mm_unpacklo_epi16(s2, s3);
+        t2 = _mm_unpacklo_epi16(s4, s5);
+        t3 = _mm_unpacklo_epi16(s6, s7);
+        t0 = _mm_unpacklo_epi32(t0, t1);
+        t2 = _mm_unpacklo_epi32(t2, t3);
+        t0 = _mm_unpacklo_epi64(t0, t2);
+
+        p0 = _mm_adds_epu16(t0, p0);
+        p0 = _mm_adds_epu16(p0, one);
+        p0 = _mm_srli_epi16(p0, 1);
+
+        _mm_storeu_si128((__m128i *)(comp_pred), p0);
+        comp_pred += 8;
+        pred += 8;
+        ref += 8 * 8;
+      }
+      ref += stride - (width << 3);
+    }
+  } else {
+    // read 4 points at one time
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j += 4) {
+        __m128i s0 = _mm_cvtsi32_si128(ref[0]);
+        __m128i s1 = _mm_cvtsi32_si128(ref[8]);
+        __m128i s2 = _mm_cvtsi32_si128(ref[16]);
+        __m128i s3 = _mm_cvtsi32_si128(ref[24]);
+        __m128i p0 = _mm_loadl_epi64((const __m128i *)pred);
+        __m128i t0, t1;
+
+        t0 = _mm_unpacklo_epi16(s0, s1);
+        t1 = _mm_unpacklo_epi16(s2, s3);
+        t0 = _mm_unpacklo_epi32(t0, t1);
+
+        p0 = _mm_adds_epu16(t0, p0);
+        p0 = _mm_adds_epu16(p0, one);
+        p0 = _mm_srli_epi16(p0, 1);
+
+        _mm_storel_epi64((__m128i *)(comp_pred), p0);
+        comp_pred += 4;
+        pred += 4;
+        ref += 4 * 8;
+      }
+      ref += stride - (width << 3);
+    }
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
new file mode 100644
index 000000000..cc7f52811
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_dsp/variance.h"
+#include "aom_dsp/aom_filter.h"
+
+// Computes, for a 4x4 block of 16-bit samples, the sum of differences
+// (a - b) into *sum and the sum of squared differences into *sse.
+// a8 / b8 are converted with CONVERT_TO_SHORTPTR; strides are in samples.
+static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
+                                         const uint8_t *b8, int b_stride,
+                                         uint64_t *sse, int64_t *sum) {
+  __m128i u0, u1, u2, u3;
+  __m128i s0, s1, s2, s3;
+  __m128i t0, t1, x0, y0;
+  __m128i a0, a1, a2, a3;
+  __m128i b0, b1, b2, b3;
+  __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
+
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+  // One 4-sample (64-bit) row per load.
+  a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
+  a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
+  a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
+  a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));
+
+  b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
+  b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
+  b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
+  b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));
+
+  // Pack two rows per register; a and b are interleaved the same way, so
+  // lanes still line up for the subtraction.
+  u0 = _mm_unpacklo_epi16(a0, a1);
+  u1 = _mm_unpacklo_epi16(a2, a3);
+  u2 = _mm_unpacklo_epi16(b0, b1);
+  u3 = _mm_unpacklo_epi16(b2, b3);
+
+  s0 = _mm_sub_epi16(u0, u2);
+  s1 = _mm_sub_epi16(u1, u3);
+
+  // Sum of differences: multiply by 1 and add adjacent pairs into 32-bit
+  // lanes, then reduce horizontally so lane 0 holds the total.
+  t0 = _mm_madd_epi16(s0, k_one_epi16);
+  t1 = _mm_madd_epi16(s1, k_one_epi16);
+
+  s2 = _mm_hadd_epi32(t0, t1);
+  s3 = _mm_hadd_epi32(s2, s2);
+  y0 = _mm_hadd_epi32(s3, s3);
+
+  // Sum of squared differences, reduced the same way.
+  t0 = _mm_madd_epi16(s0, s0);
+  t1 = _mm_madd_epi16(s1, s1);
+
+  s2 = _mm_hadd_epi32(t0, t1);
+  s3 = _mm_hadd_epi32(s2, s2);
+  x0 = _mm_hadd_epi32(s3, s3);
+
+  *sse = (uint64_t)_mm_extract_epi32(x0, 0);
+  *sum = (int64_t)_mm_extract_epi32(y0, 0);
+}
+
+// 4x4 variance without bit-depth rounding: stores the raw squared error in
+// *sse and returns sse - sum^2 / 16, clamped at 0.
+uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+                                         const uint8_t *b, int b_stride,
+                                         uint32_t *sse) {
+  int64_t sum, diff;
+  uint64_t local_sse;
+
+  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+  *sse = (uint32_t)local_sse;
+
+  diff = (int64_t)*sse - ((sum * sum) >> 4);
+  return (diff >= 0) ? (uint32_t)diff : 0;
+}
+
+// As above, with the squared error rounded down by 4 bits and the sum by
+// 2 bits before the variance is formed.
+uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+                                          const uint8_t *b, int b_stride,
+                                          uint32_t *sse) {
+  int64_t sum, diff;
+  uint64_t local_sse;
+
+  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
+  sum = ROUND_POWER_OF_TWO(sum, 2);
+
+  diff = (int64_t)*sse - ((sum * sum) >> 4);
+  return (diff >= 0) ? (uint32_t)diff : 0;
+}
+
+// As above, with rounding shifts of 8 bits (squared error) and 4 bits (sum).
+uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+                                          const uint8_t *b, int b_stride,
+                                          uint32_t *sse) {
+  int64_t sum, diff;
+  uint64_t local_sse;
+
+  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
+  sum = ROUND_POWER_OF_TWO(sum, 4);
+
+  diff = (int64_t)*sse - ((sum * sum) >> 4);
+  return diff >= 0 ? (uint32_t)diff : 0;
+}
+
+// Sub-pixel
+// Each function below runs the two-pass bilinear filter (horizontal pass
+// producing 5 rows of 4, then vertical pass producing 4x4) into a local
+// buffer and returns the matching 4x4 variance of that buffer against dst.
+uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1(
+    const uint8_t *src, int src_stride, int xoffset, int yoffset,
+    const uint8_t *dst, int dst_stride, uint32_t *sse) {
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+
+  aom_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+                                                bilinear_filters_2t[yoffset]);
+
+  return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride,
+                                  sse);
+}
+
+uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1(
+    const uint8_t *src, int src_stride, int xoffset, int yoffset,
+    const uint8_t *dst, int dst_stride, uint32_t *sse) {
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+
+  aom_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+                                                bilinear_filters_2t[yoffset]);
+
+  return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
+                                   dst_stride, sse);
+}
+
+uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1(
+    const uint8_t *src, int src_stride, int xoffset, int yoffset,
+    const uint8_t *dst, int dst_stride, uint32_t *sse) {
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+
+  aom_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+                                                bilinear_filters_2t[yoffset]);
+
+  return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
+                                   dst_stride, sse);
+}
+
+// Sub-pixel average
+// Same filtering as above; the filtered block is then combined with
+// second_pred by aom_highbd_comp_avg_pred before the variance is taken.
+
+uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
+    const uint8_t *src, int src_stride, int xoffset, int yoffset,
+    const uint8_t *dst, int dst_stride, uint32_t *sse,
+    const uint8_t *second_pred) {
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+  aom_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+                                                bilinear_filters_2t[yoffset]);
+
+  aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
+                           4);
+
+  return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride,
+                                  sse);
+}
+
+uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
+    const uint8_t *src, int src_stride, int xoffset, int yoffset,
+    const uint8_t *dst, int dst_stride, uint32_t *sse,
+    const uint8_t *second_pred) {
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+  aom_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+                                                bilinear_filters_2t[yoffset]);
+
+  aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4,
CONVERT_TO_BYTEPTR(temp2), + 4); + + return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, + dst_stride, sse); +} + +uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, + const uint8_t *second_pred) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), + 4); + + return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, + dst_stride, sse); +} diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.asm b/third_party/aom/aom_dsp/x86/intrapred_sse2.asm new file mode 100644 index 000000000..02567db49 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.asm @@ -0,0 +1,771 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pb_1: times 16 db 1
+pw_4: times 8 dw 4      ; rounding terms for the two-edge DC averages
+pw_8: times 8 dw 8
+pw_16: times 8 dw 16
+pw_32: times 8 dw 32
+dc_128: times 16 db 128 ; fill value of the dc_128 predictors
+pw2_4: times 8 dw 2     ; rounding terms for the single-edge DC averages
+pw2_8: times 8 dw 4
+pw2_16: times 8 dw 8
+pw2_32: times 8 dw 16
+
+SECTION .text
+
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
+  pavgb       %4, %1, %3
+  pxor        %3, %1                ; note: the z operand (%3) is overwritten
+  pand        %3, [GLOBAL(pb_1)]
+  psubb       %4, %3
+  pavgb       %4, %2
+%endmacro
+
+INIT_XMM sse2 ; 4x4 DC: (sum of 4 above + 4 left + 4) >> 3 in every pixel
+cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  movd        m2, [leftq]
+  movd        m0, [aboveq]
+  pxor        m1, m1
+  punpckldq   m0, m2                ; 4 above bytes followed by 4 left bytes
+  psadbw      m0, m1                ; m1 is zero, so this is the byte sum
+  paddw       m0, [GLOBAL(pw_4)]
+  psraw       m0, 3
+  pshuflw     m0, m0, 0x0
+  packuswb    m0, m0
+  movd        [dstq        ], m0
+  movd        [dstq+strideq], m0
+  lea         dstq, [dstq+strideq*2]
+  movd        [dstq        ], m0
+  movd        [dstq+strideq], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2 ; 4x4 DC from the left edge only: (sum of 4 left + 2) >> 2
+cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
+  movifnidn   leftq, leftmp
+  GET_GOT     goffsetq
+
+  pxor        m1, m1
+  movd        m0, [leftq]
+  psadbw      m0, m1
+  paddw       m0, [GLOBAL(pw2_4)]
+  psraw       m0, 2
+  pshuflw     m0, m0, 0x0
+  packuswb    m0, m0
+  movd        [dstq        ], m0
+  movd        [dstq+strideq], m0
+  lea         dstq, [dstq+strideq*2]
+  movd        [dstq        ], m0
+  movd        [dstq+strideq], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2 ; 4x4 DC from the above edge only: (sum of 4 above + 2) >> 2
+cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor        m1, m1
+  movd        m0, [aboveq]
+  psadbw      m0, m1
+  paddw       m0, [GLOBAL(pw2_4)]
+  psraw       m0, 2
+  pshuflw     m0, m0, 0x0
+  packuswb    m0, m0
+  movd        [dstq        ], m0
+  movd        [dstq+strideq], m0
+  lea         dstq, [dstq+strideq*2]
+  movd        [dstq        ], m0
+  movd        [dstq+strideq], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2 ; 8x8 DC: (sum of 8 above + 8 left + 8) >> 4 in every pixel
+cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor        m1, m1
+  movq        m0, [aboveq]
+  movq        m2, [leftq]
+  DEFINE_ARGS dst, stride, stride3  ; edges are loaded; third arg is renamed
+  lea         stride3q, [strideq*3]
+  psadbw      m0, m1
+  psadbw      m2, m1
+  paddw       m0, m2
+  paddw       m0, [GLOBAL(pw_8)]
+  psraw       m0, 4
+  punpcklbw   m0, m0                ; duplicate the DC byte within word 0
+  pshuflw     m0, m0, 0x0           ; then broadcast word 0 over the low 8 bytes
+  movq        [dstq          ], m0
+  movq        [dstq+strideq  ], m0
+  movq        [dstq+strideq*2], m0
+  movq        [dstq+stride3q ], m0
+  lea         dstq, [dstq+strideq*4]
+  movq        [dstq          ], m0
+  movq        [dstq+strideq  ], m0
+  movq        [dstq+strideq*2], m0
+  movq        [dstq+stride3q ], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2 ; 8x8 DC from the above edge only: (sum of 8 above + 4) >> 3
+cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor        m1, m1
+  movq        m0, [aboveq]
+  DEFINE_ARGS dst, stride, stride3
+  lea         stride3q, [strideq*3]
+  psadbw      m0, m1
+  paddw       m0, [GLOBAL(pw2_8)]
+  psraw       m0, 3
+  punpcklbw   m0, m0
+  pshuflw     m0, m0, 0x0
+  movq        [dstq          ], m0
+  movq        [dstq+strideq  ], m0
+  movq        [dstq+strideq*2], m0
+  movq        [dstq+stride3q ], m0
+  lea         dstq, [dstq+strideq*4]
+  movq        [dstq          ], m0
+  movq        [dstq+strideq  ], m0
+  movq        [dstq+strideq*2], m0
+  movq        [dstq+stride3q ], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2 ; 8x8 DC from the left edge only: (sum of 8 left + 4) >> 3
+cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
+  movifnidn   leftq, leftmp
+  GET_GOT     goffsetq
+
+  pxor        m1, m1
+  movq        m0, [leftq]
+  DEFINE_ARGS dst, stride, stride3
+  lea         stride3q, [strideq*3]
+  psadbw      m0, m1
+  paddw       m0, [GLOBAL(pw2_8)]
+  psraw       m0, 3
+  punpcklbw   m0, m0
+  pshuflw     m0, m0, 0x0
+  movq        [dstq          ], m0
+  movq        [dstq+strideq  ], m0
+  movq        [dstq+strideq*2], m0
+  movq        [dstq+stride3q ], m0
+  lea         dstq, [dstq+strideq*4]
+  movq        [dstq          ], m0
+  movq        [dstq+strideq  ], m0
+  movq        [dstq+strideq*2], m0
+  movq        [dstq+stride3q ], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2 ; 4x4 block filled with the constant 128; edges are not read
+cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  DEFINE_ARGS dst, stride, stride3
+  lea         stride3q, [strideq*3]
+  movd        m0, [GLOBAL(dc_128)]
+  movd        [dstq          ], m0
+  movd        [dstq+strideq  ], m0
+  movd        [dstq+strideq*2], m0
+  movd        [dstq+stride3q ], m0
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2 ; 8x8 block filled with the constant 128; edges are not read
+cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  DEFINE_ARGS dst, stride, stride3
+  lea         stride3q, [strideq*3]
+  movq        m0, [GLOBAL(dc_128)]
+  movq        [dstq          ], m0
+  movq        [dstq+strideq  ], m0
+  movq        [dstq+strideq*2], m0
+  movq        [dstq+stride3q ], m0
+  lea         dstq, [dstq+strideq*4]
+  movq        [dstq          ], m0
+  movq        [dstq+strideq  ], m0
+  movq        [dstq+strideq*2], m0
+  movq        [dstq+stride3q ], m0
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2 ; 16x16 DC: (sum of 16 above + 16 left + 16) >> 5
+cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor        m1, m1
+  mova        m0, [aboveq]
+  mova        m2, [leftq]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea         stride3q, [strideq*3]
+  mov         lines4d, 4            ; 4 iterations x 4 rows
+  psadbw      m0, m1                ; one partial sum per 8-byte half
+  psadbw      m2, m1
+  paddw       m0, m2
+  movhlps     m2, m0                ; bring the high-half sum down
+  paddw       m0, m2
+  paddw       m0, [GLOBAL(pw_16)]
+  psraw       m0, 5
+  pshuflw     m0, m0, 0x0
+  punpcklqdq  m0, m0
+  packuswb    m0, m0                ; 16 copies of the DC byte
+.loop:
+  mova        [dstq          ], m0
+  mova        [dstq+strideq  ], m0
+  mova        [dstq+strideq*2], m0
+  mova        [dstq+stride3q ], m0
+  lea         dstq, [dstq+strideq*4]
+  dec         lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+
+INIT_XMM sse2 ; 16x16 DC from the above edge only: (sum of 16 above + 8) >> 4
+cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor        m1, m1
+  mova        m0, [aboveq]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea         stride3q, [strideq*3]
+  mov         lines4d, 4
+  psadbw      m0, m1
+  movhlps     m2, m0
+  paddw       m0, m2
+  paddw       m0, [GLOBAL(pw2_16)]
+  psraw       m0, 4
+  pshuflw     m0, m0, 0x0
+  punpcklqdq  m0, m0
+  packuswb    m0, m0
+.loop:
+  mova        [dstq          ], m0
+  mova        [dstq+strideq  ], m0
+  mova        [dstq+strideq*2], m0
+  mova        [dstq+stride3q ], m0
+  lea         dstq, [dstq+strideq*4]
+  dec         lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2 ; 16x16 DC from the left edge only: (sum of 16 left + 8) >> 4
+cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor        m1, m1
+  mova        m0, [leftq]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea         stride3q, [strideq*3]
+  mov         lines4d, 4
+  psadbw      m0, m1
+  movhlps     m2, m0
+  paddw       m0, m2
+  paddw       m0, [GLOBAL(pw2_16)]
+  psraw       m0, 4
+  pshuflw     m0, m0, 0x0
+  punpcklqdq  m0, m0
+  packuswb    m0, m0
+.loop:
+  mova        [dstq          ], m0
+  mova        [dstq+strideq  ], m0
+  mova        [dstq+strideq*2], m0
+  mova        [dstq+stride3q ], m0
+  lea         dstq, [dstq+strideq*4]
+  dec         lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2 ; 16x16 block filled with the constant 128; edges are not read
+cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea         stride3q, [strideq*3]
+  mov         lines4d, 4
+  mova        m0, [GLOBAL(dc_128)]
+.loop:
+  mova        [dstq          ], m0
+  mova        [dstq+strideq  ], m0
+  mova        [dstq+strideq*2], m0
+  mova        [dstq+stride3q ], m0
+  lea         dstq, [dstq+strideq*4]
+  dec         lines4d
+  jnz .loop
+  RESTORE_GOT
+  RET
+
+
+INIT_XMM sse2 ; 32x32 DC: (sum of 32 above + 32 left + 32) >> 6
+cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor        m1, m1
+  mova        m0, [aboveq]
+  mova        m2, [aboveq+16]
+  mova        m3, [leftq]
+  mova        m4, [leftq+16]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea         stride3q, [strideq*3]
+  mov         lines4d, 8            ; 8 iterations x 4 rows
+  psadbw      m0, m1
+  psadbw      m2, m1
+  psadbw      m3, m1
+  psadbw      m4, m1
+  paddw       m0, m2
+  paddw       m0, m3
+  paddw       m0, m4
+  movhlps     m2, m0
+  paddw       m0, m2
+  paddw       m0, [GLOBAL(pw_32)]
+  psraw       m0, 6
+  pshuflw     m0, m0, 0x0
+  punpcklqdq  m0, m0
+  packuswb    m0, m0
+.loop:
+  mova        [dstq             ], m0
+  mova        [dstq          +16], m0
+  mova        [dstq+strideq     ], m0
+  mova        [dstq+strideq  +16], m0
+  mova        [dstq+strideq*2   ], m0
+  mova        [dstq+strideq*2+16], m0
+  mova        [dstq+stride3q    ], m0
+  mova        [dstq+stride3q +16], m0
+  lea         dstq, [dstq+strideq*4]
+  dec         lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2 ; 32x32 DC from the above edge only: (sum of 32 above + 16) >> 5
+cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor        m1, m1
+  mova        m0, [aboveq]
+  mova        m2, [aboveq+16]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea         stride3q, [strideq*3]
+  mov         lines4d, 8
+  psadbw      m0, m1
+  psadbw      m2, m1
+  paddw       m0, m2
+  movhlps     m2, m0
+  paddw       m0, m2
+  paddw       m0, [GLOBAL(pw2_32)]
+  psraw       m0, 5
+  pshuflw     m0, m0, 0x0
+  punpcklqdq  m0, m0
+  packuswb    m0, m0
+.loop:
+  mova        [dstq             ], m0
+  mova        [dstq          +16], m0
+  mova        [dstq+strideq     ], m0
+  mova        [dstq+strideq  +16], m0
+  mova        [dstq+strideq*2   ], m0
+  mova        [dstq+strideq*2+16], m0
+  mova        [dstq+stride3q    ], m0
+  mova        [dstq+stride3q +16], m0
+  lea         dstq, [dstq+strideq*4]
+  dec         lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2 ; 32x32 DC from the left edge only: (sum of 32 left + 16) >> 5
+cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor        m1, m1
+  mova        m0, [leftq]
+  mova        m2, [leftq+16]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea         stride3q, [strideq*3]
+  mov         lines4d, 8
+  psadbw      m0, m1
+  psadbw      m2, m1
+  paddw       m0, m2
+  movhlps     m2, m0
+  paddw       m0, m2
+  paddw       m0, [GLOBAL(pw2_32)]
+  psraw       m0, 5
+  pshuflw     m0, m0, 0x0
+  punpcklqdq  m0, m0
+  packuswb    m0, m0
+.loop:
+  mova        [dstq             ], m0
+  mova        [dstq          +16], m0
+  mova        [dstq+strideq     ], m0
+  mova        [dstq+strideq  +16], m0
+  mova        [dstq+strideq*2   ], m0
+  mova        [dstq+strideq*2+16], m0
+  mova        [dstq+stride3q    ], m0
+  mova        [dstq+stride3q +16], m0
+  lea         dstq, [dstq+strideq*4]
+  dec         lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2 ; 32x32 block filled with the constant 128; edges are not read
+cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea         stride3q, [strideq*3]
+  mov         lines4d, 8
+  mova        m0, [GLOBAL(dc_128)]
+.loop:
+  mova        [dstq             ], m0
+  mova        [dstq          +16], m0
+  mova        [dstq+strideq     ], m0
+  mova        [dstq+strideq  +16], m0
+  mova        [dstq+strideq*2   ], m0
+  mova        [dstq+strideq*2+16], m0
+  mova        [dstq+stride3q    ], m0
+  mova        [dstq+stride3q +16], m0
+  lea         dstq, [dstq+strideq*4]
+  dec         lines4d
+  jnz .loop
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2 ; 4x4 vertical: every row is a copy of the 4 above bytes
+cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
+  movd        m0, [aboveq]
+  movd        [dstq        ], m0
+  movd        [dstq+strideq], m0
+  lea         dstq, [dstq+strideq*2]
+  movd        [dstq        ], m0
+  movd        [dstq+strideq], m0
+  RET
+
+INIT_XMM sse2 ; 8x8 vertical: every row is a copy of the 8 above bytes
+cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
+  movq        m0, [aboveq]
+  DEFINE_ARGS dst, stride, stride3
+  lea         stride3q, [strideq*3]
+  movq        [dstq          ], m0
+  movq        [dstq+strideq  ], m0
+  movq        [dstq+strideq*2], m0
+  movq        [dstq+stride3q ], m0
+  lea         dstq, [dstq+strideq*4]
+  movq        [dstq          ], m0
+  movq        [dstq+strideq  ], m0
+  movq        [dstq+strideq*2], m0
+  movq        [dstq+stride3q ], m0
+  RET
+
+INIT_XMM sse2 ; 16x16 vertical: every row is a copy of the 16 above bytes
+cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
+  mova        m0, [aboveq]
+  DEFINE_ARGS dst, stride, stride3, nlines4
+  lea         stride3q, [strideq*3]
+  mov         nlines4d, 4
+.loop:
+  mova        [dstq          ], m0
+  mova        [dstq+strideq  ], m0
+  mova        [dstq+strideq*2], m0
+  mova        [dstq+stride3q ], m0
+  lea         dstq, [dstq+strideq*4]
+  dec         nlines4d
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2 ; 32x32 vertical: every row is a copy of the 32 above bytes
+cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
+  mova        m0, [aboveq]
+  mova        m1, [aboveq+16]
+  DEFINE_ARGS dst, stride, stride3, nlines4
+  lea         stride3q, [strideq*3]
+  mov         nlines4d, 8
+.loop:
+  mova        [dstq             ], m0
+  mova        [dstq          +16], m1
+  mova        [dstq+strideq     ], m0
+  mova        [dstq+strideq  +16], m1
+  mova        [dstq+strideq*2   ], m0
+  mova        [dstq+strideq*2+16], m1
+  mova        [dstq+stride3q    ], m0
+  mova        [dstq+stride3q +16], m1
+  lea         dstq, [dstq+strideq*4]
+  dec         nlines4d
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2 ; 4x4 horizontal: row r is filled with left[r]
+cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
+  movifnidn   leftq, leftmp
+  movd        m0, [leftq]
+  punpcklbw   m0, m0
+  punpcklbw   m0, m0                ; each left byte now fills one dword
+  pshufd      m1, m0, 0x1
+  movd        [dstq        ], m0
+  movd        [dstq+strideq], m1
+  pshufd      m2, m0, 0x2
+  lea         dstq, [dstq+strideq*2]
+  pshufd      m3, m0, 0x3
+  movd        [dstq        ], m2
+  movd        [dstq+strideq], m3
+  RET
+
+INIT_XMM sse2 ; 8x8 horizontal: row r is filled with left[r]
+cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
+  movifnidn   leftq, leftmp
+  mov         lineq, -2             ; counts up to 0: 2 iterations x 4 rows
+  DEFINE_ARGS dst, stride, line, left, stride3
+  lea         stride3q, [strideq*3]
+  movq        m0, [leftq ]
+  punpcklbw   m0, m0 ; l1 l1 l2 l2 ... l8 l8
+.loop:
+  pshuflw     m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1
+  pshuflw     m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2
+  movq        [dstq        ], m1
+  movq        [dstq+strideq], m2
+  pshuflw     m1, m0, 0xaa
+  pshuflw     m2, m0, 0xff
+  movq        [dstq+strideq*2], m1
+  movq        [dstq+stride3q ], m2
+  pshufd      m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
+  inc         lineq
+  lea         dstq, [dstq+strideq*4] ; lea leaves the flags from inc intact
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2 ; 16x16 horizontal: row r is filled with left[r]
+cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
+  movifnidn   leftq, leftmp
+  mov         lineq, -4             ; counts up to 0: 4 iterations x 4 rows
+  DEFINE_ARGS dst, stride, line, left, stride3
+  lea         stride3q, [strideq*3]
+.loop:
+  movd        m0, [leftq]
+  punpcklbw   m0, m0
+  punpcklbw   m0, m0 ; l1 to l4 each repeated 4 times
+  pshufd      m1, m0, 0x0 ; l1 repeated 16 times
+  pshufd      m2, m0, 0x55 ; l2 repeated 16 times
+  mova        [dstq          ], m1
+  mova        [dstq+strideq  ], m2
+  pshufd      m1, m0, 0xaa
+  pshufd      m2, m0, 0xff
+  mova        [dstq+strideq*2], m1
+  mova        [dstq+stride3q ], m2
+  inc         lineq
+  lea         leftq, [leftq+4 ]
+  lea         dstq, [dstq+strideq*4]
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2 ; 32x32 horizontal: row r is filled with left[r]
+cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
+  movifnidn   leftq, leftmp
+  mov         lineq, -8             ; counts up to 0: 8 iterations x 4 rows
+  DEFINE_ARGS dst, stride, line, left, stride3
+  lea         stride3q, [strideq*3]
+.loop:
+  movd        m0, [leftq]
+  punpcklbw   m0, m0
+  punpcklbw   m0, m0 ; l1 to l4 each repeated 4 times
+  pshufd      m1, m0, 0x0 ; l1 repeated 16 times
+  pshufd      m2, m0, 0x55 ; l2 repeated 16 times
+  mova        [dstq             ], m1
+  mova        [dstq+16          ], m1
+  mova        [dstq+strideq     ], m2
+  mova        [dstq+strideq+16  ], m2
+  pshufd      m1, m0, 0xaa
+  pshufd      m2, m0, 0xff
+  mova        [dstq+strideq*2   ], m1
+  mova        [dstq+strideq*2+16], m1
+  mova        [dstq+stride3q    ], m2
+  mova        [dstq+stride3q+16 ], m2
+  inc         lineq
+  lea         leftq, [leftq+4 ]
+  lea         dstq, [dstq+strideq*4]
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2 ; 4x4 TM: left[r] + above[c] - top-left, saturated by packuswb
+cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left
+  pxor        m1, m1
+  movq        m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x
+  punpcklbw   m0, m1
+  pshuflw     m2, m0, 0x0 ; [63:0] tl tl tl tl [word]
+  psrldq      m0, 2
+  psubw       m0, m2 ; [63:0] t1-tl t2-tl t3-tl t4-tl [word]
+  movd        m2, [leftq]
+  punpcklbw   m2, m1
+  pshuflw     m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word]
+  pshuflw     m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word]
+  paddw       m4, m0
+  paddw       m3, m0
+  packuswb    m4, m4
+  packuswb    m3, m3
+  movd        [dstq        ], m4
+  movd        [dstq+strideq], m3
+  lea         dstq, [dstq+strideq*2]
+  pshuflw     m4, m2, 0xaa
+  pshuflw     m3, m2, 0xff
+  paddw       m4, m0
+  paddw       m3, m0
+  packuswb    m4, m4
+  packuswb    m3, m3
+  movd        [dstq        ], m4
+  movd        [dstq+strideq], m3
+  RET
+
+INIT_XMM sse2 ; 8x8 TM: left[r] + above[c] - top-left, saturated by packuswb
+cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left
+  pxor        m1, m1
+  movd        m2, [aboveq-1]
+  movq        m0, [aboveq]
+  punpcklbw   m2, m1
+  punpcklbw   m0, m1 ; t1 t2 t3 t4 t5 t6 t7 t8 [word]
+  pshuflw     m2, m2, 0x0 ; [63:0] tl tl tl tl [word]
+  DEFINE_ARGS dst, stride, line, left
+  mov         lineq, -4             ; counts up to 0: 4 iterations x 2 rows
+  punpcklqdq  m2, m2 ; tl tl tl tl tl tl tl tl [word]
+  psubw       m0, m2 ; t1-tl t2-tl ... t8-tl [word]
+  movq        m2, [leftq]
+  punpcklbw   m2, m1 ; l1 l2 l3 l4 l5 l6 l7 l8 [word]
+.loop:
+  pshuflw     m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word]
+  pshuflw     m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word]
+  punpcklqdq  m4, m4 ; l1 l1 l1 l1 l1 l1 l1 l1 [word]
+  punpcklqdq  m3, m3 ; l2 l2 l2 l2 l2 l2 l2 l2 [word]
+  paddw       m4, m0
+  paddw       m3, m0
+  packuswb    m4, m3                ; two output rows packed into one register
+  movq        [dstq        ], m4
+  movhps      [dstq+strideq], m4
+  lea         dstq, [dstq+strideq*2]
+  psrldq      m2, 4                 ; drop the two left values just used
+  inc         lineq
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2 ; 16x16 TM: left[r] + above[c] - top-left, saturated by packuswb
+cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left
+  pxor        m1, m1
+  mova        m2, [aboveq-16];
+  mova        m0, [aboveq] ; t1 t2 ... t16 [byte]
+  punpckhbw   m2, m1 ; [127:112] tl [word]
+  punpckhbw   m4, m0, m1
+  punpcklbw   m0, m1 ; m0:m4 t1 t2 ... t16 [word]
+  DEFINE_ARGS dst, stride, line, left, stride8
+  mov         lineq, -8             ; 8 iterations, each writing rows r and r+8
+  pshufhw     m2, m2, 0xff
+  mova        m3, [leftq] ; l1 l2 ... l16 [byte]
+  punpckhqdq  m2, m2 ; tl repeated 8 times [word]
+  psubw       m0, m2
+  psubw       m4, m2 ; m0:m4 t1-tl t2-tl ... t16-tl [word]
+  punpckhbw   m5, m3, m1
+  punpcklbw   m3, m1 ; m3:m5 l1 l2 ... l16 [word]
+  lea         stride8q, [strideq*8]
+.loop:
+  pshuflw     m6, m3, 0x0
+  pshuflw     m7, m5, 0x0
+  punpcklqdq  m6, m6 ; l1 repeated 8 times [word]
+  punpcklqdq  m7, m7 ; l9 repeated 8 times [word] (m5 holds l9..l16)
+  paddw       m1, m6, m0
+  paddw       m6, m4 ; m1:m6 ti-tl+l1 [i=1,16] [word]
+  psrldq      m5, 2
+  packuswb    m1, m6
+  mova        [dstq         ], m1
+  paddw       m1, m7, m0
+  paddw       m7, m4 ; m1:m7 ti-tl+l9 [i=1,16] [word]
+  psrldq      m3, 2
+  packuswb    m1, m7
+  mova        [dstq+stride8q], m1
+  inc         lineq
+  lea         dstq, [dstq+strideq]
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2 ; 32x32 TM: left[r] + above[c] - top-left, saturated by packuswb
+cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left
+  pxor        m1, m1
+  movd        m2, [aboveq-1]
+  mova        m0, [aboveq]
+  mova        m4, [aboveq+16]
+  punpcklbw   m2, m1
+  punpckhbw   m3, m0, m1
+  punpckhbw   m5, m4, m1
+  punpcklbw   m0, m1
+  punpcklbw   m4, m1
+  pshuflw     m2, m2, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  mov         lineq, -16            ; counts up to 0: 16 iterations x 2 rows
+  punpcklqdq  m2, m2
+  add         leftq, 32             ; so [leftq+lineq*2] starts at left[0]
+  psubw       m0, m2
+  psubw       m3, m2
+  psubw       m4, m2
+  psubw       m5, m2
+.loop:
+  movd        m2, [leftq+lineq*2]
+  pxor        m1, m1                ; m1 is reused as scratch below, so re-zero
+  punpcklbw   m2, m1
+  pshuflw     m7, m2, 0x55
+  pshuflw     m2, m2, 0x0
+  punpcklqdq  m2, m2
+  punpcklqdq  m7, m7
+  paddw       m6, m2, m3
+  paddw       m1, m2, m0
+  packuswb    m1, m6
+  mova        [dstq           ], m1
+  paddw       m6, m2, m5
+  paddw       m1, m2, m4
+  packuswb    m1, m6
+  mova        [dstq+16        ], m1
+  paddw       m6, m7, m3
+  paddw       m1, m7, m0
+  packuswb    m1, m6
+  mova        [dstq+strideq   ], m1
+  paddw       m6, m7, m5
+  paddw       m1, m7, m4
+  packuswb    m1, m6
+  mova        [dstq+strideq+16], m1
+  lea         dstq, [dstq+strideq*2]
+  inc         lineq
+  jnz .loop
+  REP_RET
diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.asm b/third_party/aom/aom_dsp/x86/intrapred_ssse3.asm
new file mode 100644
index 000000000..bc1bb2ff3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.asm
@@ -0,0 +1,410 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+
+pb_1: times 16 db 1
+sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 ; pshufb index tables; names spell the leading indices
+sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
+sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
+sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0
+sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+SECTION .text
+
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
+  pavgb       %4, %1, %3
+  pxor        %3, %1                ; note: the z operand (%3) is overwritten
+  pand        %3, [GLOBAL(pb_1)]
+  psubb       %4, %3
+  pavgb       %4, %2
+%endmacro
+
+INIT_XMM ssse3 ; 4x4 from the above row only: 2-tap rows 0/2, 3-tap rows 1/3
+cglobal d63e_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
+  GET_GOT     goffsetq
+
+  movq        m3, [aboveq]
+  pshufb      m1, m3, [GLOBAL(sh_b23456777)] ; above shifted by two bytes
+  pshufb      m2, m3, [GLOBAL(sh_b12345677)] ; above shifted by one byte
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4
+  pavgb       m3, m2                ; 2-tap average of neighbouring above bytes
+
+  ; store 4 lines
+  movd        [dstq        ], m3
+  movd        [dstq+strideq], m4
+  lea         dstq, [dstq+strideq*2]
+  psrldq      m3, 1                 ; rows 2 and 3 reuse rows 0 and 1, one byte on
+  psrldq      m4, 1
+  movd        [dstq        ], m3
+  movd        [dstq+strideq], m4
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3 ; 4x4 d153: built from left, top-left and above (layout below)
+cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  movd        m0, [leftq] ; l1, l2, l3, l4
+  movd        m1, [aboveq-1] ; tl, t1, t2, t3
+  punpckldq   m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3
+  pshufb      m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3
+  psrldq      m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3
+  psrldq      m2, m0, 2 ; l2, l1, tl, t1, t2, t3
+  ; comments below are for a predictor like this
+  ; A1 B1 C1 D1
+  ; A2 B2 A1 B1
+  ; A3 B3 A2 B2
+  ; A4 B4 A3 B3
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1
+  pavgb       m1, m0 ; 2-tap avg A4 A3 A2 A1
+
+  punpcklqdq  m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..
+
+  DEFINE_ARGS dst, stride, stride3
+  lea         stride3q, [strideq*3]
+  pshufb      m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
+  movd        [dstq+stride3q ], m3 ; rows are written bottom-up
+  psrldq      m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 ..
+  movd        [dstq+strideq*2], m3
+  psrldq      m3, 2 ; A2 B2 A1 B1 C1 D1 ..
+  movd        [dstq+strideq  ], m3
+  psrldq      m3, 2 ; A1 B1 C1 D1 ..
+  movd        [dstq          ], m3
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3 ; 8x8 d153: built from left, top-left and above (layout below)
+cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  movq        m0, [leftq] ; [0- 7] l1-8 [byte]
+  movhps      m0, [aboveq-1] ; [8-15] tl, t1-7 [byte]
+  pshufb      m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [word]
+  pshufb      m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [word]
+  pshufb      m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [word]
+  pshufb      m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [word]
+  psrldq      m4, m0, 1 ; t1-7 [word]
+  psrldq      m5, m0, 2 ; t2-7 [word]
+  ; comments below are for a predictor like this
+  ; A1 B1 C1 D1 E1 F1 G1 H1
+  ; A2 B2 A1 B1 C1 D1 E1 F1
+  ; A3 B3 A2 B2 A1 B1 C1 D1
+  ; A4 B4 A3 B3 A2 B2 A1 B1
+  ; A5 B5 A4 B4 A3 B3 A2 B2
+  ; A6 B6 A5 B5 A4 B4 A3 B3
+  ; A7 B7 A6 B6 A5 B5 A4 B4
+  ; A8 B8 A7 B7 A6 B6 A5 B5
+  pavgb       m6, m1, m2 ; 2-tap avg A8-A1
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1
+
+  punpcklbw   m6, m0 ; A-B8, A-B7 ... A-B2, A-B1
+
+  DEFINE_ARGS dst, stride, stride3
+  lea         stride3q, [strideq*3]
+
+  movhps      [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1
+  palignr     m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1
+  movq        [dstq+strideq*2], m0
+  psrldq      m0, 2 ; A-B2, A-B1, C-H1
+  movq        [dstq+strideq  ], m0
+  psrldq      m0, 2 ; A-H1
+  movq        [dstq          ], m0
+  lea         dstq, [dstq+strideq*4]
+  movq        [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5
+  psrldq      m6, 2 ; A-B7, A-B6, A-B5, A-B4
+  movq        [dstq+strideq*2], m6
+  psrldq      m6, 2 ; A-B6, A-B5, A-B4, A-B3
+  movq        [dstq+strideq  ], m6
+  psrldq      m6, 2 ; A-B5, A-B4, A-B3, A-B2
+  movq        [dstq          ], m6
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3 ; 16x16 d153: built from left, top-left and above (layout below)
+cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  mova        m0, [leftq]
+  movu        m7, [aboveq-1]
+  ; comments below are for a predictor like this
+  ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
+  ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
+  ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
+  ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
+  ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
+  ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
+  ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
+  ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
+  ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
+  ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
+  ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
+  ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
+  ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
+  ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
+  ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
+  ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
+  pshufb      m6, m7, [GLOBAL(sh_bfedcba9876543210)]
+  palignr     m5, m0, m6, 15
+  palignr     m3, m0, m6, 14
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
+  pshufb      m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; NOTE(review): m1 is overwritten below before any read - looks unused, confirm
+  pavgb       m5, m0 ; A1 - Ag
+
+  punpcklbw   m0, m4, m5 ; A-B8 ... A-B1
+  punpckhbw   m4, m5 ; A-B9 ... A-Bg
+
+  pshufb      m3, m7, [GLOBAL(sh_b123456789abcdeff)]
+  pshufb      m5, m7, [GLOBAL(sh_b23456789abcdefff)]
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1
+
+  pshufb      m6, m0, [GLOBAL(sh_bfedcba9876543210)]
+  DEFINE_ARGS dst, stride, stride3
+  lea         stride3q, [strideq*3]
+  palignr     m2, m1, m6, 14        ; each row is a 16-byte window over m1:m6
+  mova        [dstq          ], m2
+  palignr     m2, m1, m6, 12
+  mova        [dstq+strideq  ], m2
+  palignr     m2, m1, m6, 10
+  mova        [dstq+strideq*2], m2
+  palignr     m2, m1, m6, 8
+  mova        [dstq+stride3q ], m2
+  lea         dstq, [dstq+strideq*4]
+  palignr     m2, m1, m6, 6
+  mova        [dstq          ], m2
+  palignr     m2, m1, m6, 4
+  mova        [dstq+strideq  ], m2
+  palignr     m2, m1, m6, 2
+  mova        [dstq+strideq*2], m2
+  pshufb      m4, [GLOBAL(sh_bfedcba9876543210)]
+  mova        [dstq+stride3q ], m6
+  lea         dstq, [dstq+strideq*4]
+
+  palignr     m2, m6, m4, 14        ; lower eight rows: windows over m6:m4
+  mova        [dstq          ], m2
+  palignr     m2, m6, m4, 12
+  mova        [dstq+strideq  ], m2
+  palignr     m2, m6, m4, 10
+  mova        [dstq+strideq*2], m2
+  palignr     m2, m6, m4, 8
+  mova        [dstq+stride3q ], m2
+  lea         dstq, [dstq+strideq*4]
+  palignr     m2, m6, m4, 6
+  mova        [dstq          ], m2
+  palignr     m2, m6, m4, 4
+  mova        [dstq+strideq  ], m2
+  palignr     m2, m6, m4, 2
+  mova        [dstq+strideq*2], m2
+  mova        [dstq+stride3q ], m4
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3 ; 32x32 d153: built from left, top-left and above, 8 rows at a time
+cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  mova        m0, [leftq]
+  movu        m7, [aboveq-1]
+  movu        m1, [aboveq+15]
+
+  pshufb      m4, m1, [GLOBAL(sh_b123456789abcdeff)]
+  pshufb      m6, m1, [GLOBAL(sh_b23456789abcdefff)]
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high]
+
+  palignr     m3, m1, m7, 1
+  palignr     m5, m1, m7, 2
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low]
+
+  pshufb      m7, [GLOBAL(sh_bfedcba9876543210)]
+  palignr     m5, m0, m7, 15
+  palignr     m3, m0, m7, 14
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
+  pavgb       m5, m0 ; A1 - Ag
+  punpcklbw   m6, m4, m5 ; A-B8 ... A-B1
+  punpckhbw   m4, m5 ; A-B9 ... A-Bg
+  pshufb      m6, [GLOBAL(sh_bfedcba9876543210)]
+  pshufb      m4, [GLOBAL(sh_bfedcba9876543210)]
+
+  DEFINE_ARGS dst, stride, stride3, left, line
+  lea         stride3q, [strideq*3]
+
+  palignr     m5, m2, m1, 14        ; rows 0-7: windows over m2:m1:m6
+  palignr     m7, m1, m6, 14
+  mova        [dstq             ], m7
+  mova        [dstq+16          ], m5
+  palignr     m5, m2, m1, 12
+  palignr     m7, m1, m6, 12
+  mova        [dstq+strideq     ], m7
+  mova        [dstq+strideq+16  ], m5
+  palignr     m5, m2, m1, 10
+  palignr     m7, m1, m6, 10
+  mova        [dstq+strideq*2   ], m7
+  mova        [dstq+strideq*2+16], m5
+  palignr     m5, m2, m1, 8
+  palignr     m7, m1, m6, 8
+  mova        [dstq+stride3q    ], m7
+  mova        [dstq+stride3q+16 ], m5
+  lea         dstq, [dstq+strideq*4]
+  palignr     m5, m2, m1, 6
+  palignr     m7, m1, m6, 6
+  mova        [dstq             ], m7
+  mova        [dstq+16          ], m5
+  palignr     m5, m2, m1, 4
+  palignr     m7, m1, m6, 4
+  mova        [dstq+strideq     ], m7
+  mova        [dstq+strideq+16  ], m5
+  palignr     m5, m2, m1, 2
+  palignr     m7, m1, m6, 2
+  mova        [dstq+strideq*2   ], m7
+  mova        [dstq+strideq*2+16], m5
+  mova        [dstq+stride3q    ], m6
+  mova        [dstq+stride3q+16 ], m1
+  lea         dstq, [dstq+strideq*4]
+
+  palignr     m5, m1, m6, 14        ; rows 8-15: windows over m1:m6:m4
+  palignr     m3, m6, m4, 14
+  mova        [dstq             ], m3
+  mova        [dstq+16          ], m5
+  palignr     m5, m1, m6, 12
+  palignr     m3, m6, m4, 12
+  mova        [dstq+strideq     ], m3
+  mova        [dstq+strideq+16  ], m5
+  palignr     m5, m1, m6, 10
+  palignr     m3, m6, m4, 10
+  mova        [dstq+strideq*2   ], m3
+  mova        [dstq+strideq*2+16], m5
+  palignr     m5, m1, m6, 8
+  palignr     m3, m6, m4, 8
+  mova        [dstq+stride3q    ], m3
+  mova        [dstq+stride3q+16 ], m5
+  lea         dstq, [dstq+strideq*4]
+  palignr     m5, m1, m6, 6
+  palignr     m3, m6, m4, 6
+  mova        [dstq             ], m3
+  mova        [dstq+16          ], m5
+  palignr     m5, m1, m6, 4
+  palignr     m3, m6, m4, 4
+  mova        [dstq+strideq     ], m3
+  mova        [dstq+strideq+16  ], m5
+  palignr     m5, m1, m6, 2
+  palignr     m3, m6, m4, 2
+  mova        [dstq+strideq*2   ], m3
+  mova        [dstq+strideq*2+16], m5
+  mova        [dstq+stride3q    ], m4
+  mova        [dstq+stride3q+16 ], m6
+  lea         dstq, [dstq+strideq*4]
+
+  mova        m7, [leftq]           ; second half of the left column comes next
+  mova        m3, [leftq+16]
+  palignr     m5, m3, m7, 15
+  palignr     m0, m3, m7, 14
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh -
+  pavgb       m5, m3 ; Ah -
+  punpcklbw   m3, m2, m5 ; A-B8 ... A-B1
+  punpckhbw   m2, m5 ; A-B9 ... A-Bg
+  pshufb      m3, [GLOBAL(sh_bfedcba9876543210)]
+  pshufb      m2, [GLOBAL(sh_bfedcba9876543210)]
+
+  palignr     m7, m6, m4, 14        ; rows 16-23: windows over m6:m4:m3
+  palignr     m0, m4, m3, 14
+  mova        [dstq             ], m0
+  mova        [dstq+16          ], m7
+  palignr     m7, m6, m4, 12
+  palignr     m0, m4, m3, 12
+  mova        [dstq+strideq     ], m0
+  mova        [dstq+strideq+16  ], m7
+  palignr     m7, m6, m4, 10
+  palignr     m0, m4, m3, 10
+  mova        [dstq+strideq*2   ], m0
+  mova        [dstq+strideq*2+16], m7
+  palignr     m7, m6, m4, 8
+  palignr     m0, m4, m3, 8
+  mova        [dstq+stride3q    ], m0
+  mova        [dstq+stride3q+16 ], m7
+  lea         dstq, [dstq+strideq*4]
+  palignr     m7, m6, m4, 6
+  palignr     m0, m4, m3, 6
+  mova        [dstq             ], m0
+  mova        [dstq+16          ], m7
+  palignr     m7, m6, m4, 4
+  palignr     m0, m4, m3, 4
+  mova        [dstq+strideq     ], m0
+  mova        [dstq+strideq+16  ], m7
+  palignr     m7, m6, m4, 2
+  palignr     m0, m4, m3, 2
+  mova        [dstq+strideq*2   ], m0
+  mova        [dstq+strideq*2+16], m7
+  mova        [dstq+stride3q    ], m3
+  mova        [dstq+stride3q+16 ], m4
+  lea         dstq, [dstq+strideq*4]
+
+  palignr     m7, m4, m3, 14        ; rows 24-31: windows over m4:m3:m2
+  palignr     m0, m3, m2, 14
+  mova        [dstq             ], m0
+  mova        [dstq+16          ], m7
+  palignr     m7, m4, m3, 12
+  palignr     m0, m3, m2, 12
+  mova        [dstq+strideq     ], m0
+  mova        [dstq+strideq+16  ], m7
+  palignr     m7, m4, m3, 10
+  palignr     m0, m3, m2, 10
+  mova        [dstq+strideq*2   ], m0
+  mova        [dstq+strideq*2+16], m7
+  palignr     m7, m4, m3, 8
+  palignr     m0, m3, m2, 8
+  mova        [dstq+stride3q    ], m0
+  mova        [dstq+stride3q+16 ], m7
+  lea         dstq, [dstq+strideq*4]
+  palignr     m7, m4, m3, 6
+  palignr     m0, m3, m2, 6
+  mova        [dstq             ], m0
+  mova        [dstq+16          ], m7
+  palignr     m7, m4, m3, 4
+  palignr     m0, m3, m2, 4
+  mova        [dstq+strideq     ], m0
+  mova        [dstq+strideq+16  ], m7
+  palignr     m7, m4, m3, 2
+  palignr     m0, m3, m2, 2
+  mova        [dstq+strideq*2   ], m0
+  mova        [dstq+strideq*2+16], m7
+  mova        [dstq+stride3q    ], m2
+  mova        [dstq+stride3q+16 ], m3
+
+  RESTORE_GOT
+  RET
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c
new file mode 100644
index 
000000000..5795a1845 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c @@ -0,0 +1,3631 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/x86/inv_txfm_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" + +#define RECON_AND_STORE4X4(dest, in_x) \ + { \ + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + *(int *)(dest) = _mm_cvtsi128_si32(d0); \ + } + +void aom_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i eight = _mm_set1_epi16(8); + const __m128i cst = _mm_setr_epi16( + (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, + (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, + (int16_t)cospi_8_64, (int16_t)cospi_24_64); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i input0, input1, input2, input3; + + // Rows + input0 = load_input_data(input); + input2 = load_input_data(input + 8); + + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + input0 = _mm_shufflelo_epi16(input0, 0xd8); + input0 = _mm_shufflehi_epi16(input0, 0xd8); + input2 = _mm_shufflelo_epi16(input2, 0xd8); + input2 = _mm_shufflehi_epi16(input2, 0xd8); + + input1 = _mm_unpackhi_epi32(input0, input0); + input0 = _mm_unpacklo_epi32(input0, input0); + input3 = _mm_unpackhi_epi32(input2, input2); + input2 = 
_mm_unpacklo_epi32(input2, input2); + + // Stage 1 + input0 = _mm_madd_epi16(input0, cst); + input1 = _mm_madd_epi16(input1, cst); + input2 = _mm_madd_epi16(input2, cst); + input3 = _mm_madd_epi16(input3, cst); + + input0 = _mm_add_epi32(input0, rounding); + input1 = _mm_add_epi32(input1, rounding); + input2 = _mm_add_epi32(input2, rounding); + input3 = _mm_add_epi32(input3, rounding); + + input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); + input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); + input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); + input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); + + // Stage 2 + input0 = _mm_packs_epi32(input0, input1); + input1 = _mm_packs_epi32(input2, input3); + + // Transpose + input2 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpackhi_epi16(input0, input1); + input0 = _mm_unpacklo_epi32(input2, input3); + input1 = _mm_unpackhi_epi32(input2, input3); + + // Switch column2, column 3, and then, we got: + // input2: column1, column 0; input3: column2, column 3. 
+ input1 = _mm_shuffle_epi32(input1, 0x4e); + input2 = _mm_add_epi16(input0, input1); + input3 = _mm_sub_epi16(input0, input1); + + // Columns + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + input0 = _mm_unpacklo_epi32(input2, input2); + input1 = _mm_unpackhi_epi32(input2, input2); + input2 = _mm_unpackhi_epi32(input3, input3); + input3 = _mm_unpacklo_epi32(input3, input3); + + // Stage 1 + input0 = _mm_madd_epi16(input0, cst); + input1 = _mm_madd_epi16(input1, cst); + input2 = _mm_madd_epi16(input2, cst); + input3 = _mm_madd_epi16(input3, cst); + + input0 = _mm_add_epi32(input0, rounding); + input1 = _mm_add_epi32(input1, rounding); + input2 = _mm_add_epi32(input2, rounding); + input3 = _mm_add_epi32(input3, rounding); + + input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); + input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); + input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); + input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); + + // Stage 2 + input0 = _mm_packs_epi32(input0, input2); + input1 = _mm_packs_epi32(input1, input3); + + // Transpose + input2 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpackhi_epi16(input0, input1); + input0 = _mm_unpacklo_epi32(input2, input3); + input1 = _mm_unpackhi_epi32(input2, input3); + + // Switch column2, column 3, and then, we got: + // input2: column1, column 0; input3: column2, column 3. 
+ input1 = _mm_shuffle_epi32(input1, 0x4e); + input2 = _mm_add_epi16(input0, input1); + input3 = _mm_sub_epi16(input0, input1); + + // Final round and shift + input2 = _mm_add_epi16(input2, eight); + input3 = _mm_add_epi16(input3, eight); + + input2 = _mm_srai_epi16(input2, 4); + input3 = _mm_srai_epi16(input3, 4); + + // Reconstruction and Store + { + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); + __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi32(d0, + _mm_cvtsi32_si128(*(const int *)(dest + stride))); + d2 = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); + d0 = _mm_unpacklo_epi8(d0, zero); + d2 = _mm_unpacklo_epi8(d2, zero); + d0 = _mm_add_epi16(d0, input2); + d2 = _mm_add_epi16(d2, input3); + d0 = _mm_packus_epi16(d0, d2); + // store input0 + *(int *)dest = _mm_cvtsi128_si32(d0); + // store input1 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); + // store input2 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); + // store input3 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); + } +} + +void aom_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a; + + a = (int)dct_const_round_shift(input[0] * cospi_16_64); + a = (int)dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 4); + + if (a == 0) return; + + dc_value = _mm_set1_epi16(a); + + RECON_AND_STORE4X4(dest + 0 * stride, dc_value); + RECON_AND_STORE4X4(dest + 1 * stride, dc_value); + RECON_AND_STORE4X4(dest + 2 * stride, dc_value); + RECON_AND_STORE4X4(dest + 3 * stride, dc_value); +} + +void aom_idct4_sse2(__m128i *in) { + const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_m08 = 
pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u[8], v[8]; + + array_transpose_4x4(in); + // stage 1 + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpackhi_epi16(in[0], in[1]); + v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + + u[0] = _mm_packs_epi32(v[0], v[1]); + u[1] = _mm_packs_epi32(v[3], v[2]); + + // stage 2 + in[0] = _mm_add_epi16(u[0], u[1]); + in[1] = _mm_sub_epi16(u[0], u[1]); + in[1] = _mm_shuffle_epi32(in[1], 0x4E); +} + +void aom_iadst4_sse2(__m128i *in) { + const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); + const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); + const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); + const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); + const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); + const __m128i kZero = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u[8], v[8], in7; + + array_transpose_4x4(in); + in7 = _mm_srli_si128(in[1], 8); + in7 = _mm_add_epi16(in7, in[0]); + in7 = _mm_sub_epi16(in7, in[1]); + + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpackhi_epi16(in[0], in[1]); + u[2] = _mm_unpacklo_epi16(in7, kZero); + u[3] = 
_mm_unpackhi_epi16(in[0], kZero); + + v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 + v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 + v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 + v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 + v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 + v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = _mm_add_epi32(v[3], v[4]); + u[2] = v[2]; + u[3] = _mm_add_epi32(u[0], u[1]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_add_epi32(u[3], v[5]); + u[6] = _mm_sub_epi32(u[5], u[4]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[1]); + in[1] = _mm_packs_epi32(u[2], u[3]); +} + +// Define Macro for multiplying elements by constants and adding them together. 
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \ + res0, res1, res2, res3) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + tmp4 = _mm_madd_epi16(lo_1, cst2); \ + tmp5 = _mm_madd_epi16(hi_1, cst2); \ + tmp6 = _mm_madd_epi16(lo_1, cst3); \ + tmp7 = _mm_madd_epi16(hi_1, cst3); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + tmp4 = _mm_add_epi32(tmp4, rounding); \ + tmp5 = _mm_add_epi32(tmp5, rounding); \ + tmp6 = _mm_add_epi32(tmp6, rounding); \ + tmp7 = _mm_add_epi32(tmp7, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ + res2 = _mm_packs_epi32(tmp4, tmp5); \ + res3 = _mm_packs_epi32(tmp6, tmp7); \ + } + +#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = 
_mm_packs_epi32(tmp2, tmp3); \ + } + +#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \ + out4, out5, out6, out7) \ + { \ + /* Stage1 */ \ + { \ + const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ + const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ + const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ + const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ + \ + MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \ + stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \ + } \ + \ + /* Stage2 */ \ + { \ + const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ + const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ + const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ + const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ + \ + MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \ + stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ + tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ + tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ + tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = 
_mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + } \ + \ + /* Stage4 */ \ + out0 = _mm_adds_epi16(stp1_0, stp2_7); \ + out1 = _mm_adds_epi16(stp1_1, stp1_6); \ + out2 = _mm_adds_epi16(stp1_2, stp1_5); \ + out3 = _mm_adds_epi16(stp1_3, stp2_4); \ + out4 = _mm_subs_epi16(stp1_3, stp2_4); \ + out5 = _mm_subs_epi16(stp1_2, stp1_5); \ + out6 = _mm_subs_epi16(stp1_1, stp1_6); \ + out7 = _mm_subs_epi16(stp1_0, stp2_7); \ + } + +void aom_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // Load input data. 
+ in0 = load_input_data(input); + in1 = load_input_data(input + 8 * 1); + in2 = load_input_data(input + 8 * 2); + in3 = load_input_data(input + 8 * 3); + in4 = load_input_data(input + 8 * 4); + in5 = load_input_data(input + 8 * 5); + in6 = load_input_data(input + 8 * 6); + in7 = load_input_data(input + 8 * 7); + + // 2-D + for (i = 0; i < 2; i++) { + // 8x8 Transpose is copied from aom_fdct8x8_sse2() + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + // 4-stage 1D idct8x8 + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, + in6, in7); + } + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + RECON_AND_STORE(dest + 0 * stride, in0); + RECON_AND_STORE(dest + 1 * stride, in1); + RECON_AND_STORE(dest + 2 * stride, in2); + RECON_AND_STORE(dest + 3 * stride, in3); + RECON_AND_STORE(dest + 4 * stride, in4); + RECON_AND_STORE(dest + 5 * stride, in5); + RECON_AND_STORE(dest + 6 * stride, in6); + RECON_AND_STORE(dest + 7 * stride, in7); +} + +void aom_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a; + + a = (int)dct_const_round_shift(input[0] * cospi_16_64); + a = (int)dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 5); + + if (a == 0) return; + + dc_value = _mm_set1_epi16(a); + + 
RECON_AND_STORE(dest + 0 * stride, dc_value); + RECON_AND_STORE(dest + 1 * stride, dc_value); + RECON_AND_STORE(dest + 2 * stride, dc_value); + RECON_AND_STORE(dest + 3 * stride, dc_value); + RECON_AND_STORE(dest + 4 * stride, dc_value); + RECON_AND_STORE(dest + 5 * stride, dc_value); + RECON_AND_STORE(dest + 6 * stride, dc_value); + RECON_AND_STORE(dest + 7 * stride, dc_value); +} + +void aom_idct8_sse2(__m128i *in) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + // 8x8 Transpose is copied from aom_fdct8x8_sse2() + TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0, + in1, in2, in3, in4, in5, in6, in7); + + // 4-stage 1D idct8x8 + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3], + in[4], in[5], in[6], in[7]); +} + +void aom_iadst8_sse2(__m128i *in) { + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, 
cospi_14_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__const_0 = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; + __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + + // transpose + array_transpose_8x8(in, in); + + // properly aligned for butterfly input + in0 = in[7]; + in1 = in[0]; + in2 = in[5]; + in3 = in[2]; + in4 = in[3]; + in5 = in[4]; + in6 = in[1]; + in7 = in[6]; + + // column transformation + // stage 1 + // interleave and multiply/add into 32-bit integer + s0 = _mm_unpacklo_epi16(in0, in1); + s1 = _mm_unpackhi_epi16(in0, in1); + s2 = _mm_unpacklo_epi16(in2, in3); + s3 = _mm_unpackhi_epi16(in2, in3); + s4 = _mm_unpacklo_epi16(in4, in5); + s5 = _mm_unpackhi_epi16(in4, in5); + s6 = _mm_unpacklo_epi16(in6, in7); + s7 = _mm_unpackhi_epi16(in6, in7); + + u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); + u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); + u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); + u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); + u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); + u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); + u6 = _mm_madd_epi16(s2, 
k__cospi_p22_m10); + u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); + u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); + u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); + u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); + u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); + u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); + u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); + u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); + u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); + + // addition + w0 = _mm_add_epi32(u0, u8); + w1 = _mm_add_epi32(u1, u9); + w2 = _mm_add_epi32(u2, u10); + w3 = _mm_add_epi32(u3, u11); + w4 = _mm_add_epi32(u4, u12); + w5 = _mm_add_epi32(u5, u13); + w6 = _mm_add_epi32(u6, u14); + w7 = _mm_add_epi32(u7, u15); + w8 = _mm_sub_epi32(u0, u8); + w9 = _mm_sub_epi32(u1, u9); + w10 = _mm_sub_epi32(u2, u10); + w11 = _mm_sub_epi32(u3, u11); + w12 = _mm_sub_epi32(u4, u12); + w13 = _mm_sub_epi32(u5, u13); + w14 = _mm_sub_epi32(u6, u14); + w15 = _mm_sub_epi32(u7, u15); + + // shift and rounding + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); + v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); + v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); + v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); + v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); + v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); + v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); + v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = 
_mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); + u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); + u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); + u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); + u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); + u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); + u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); + u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); + + // back to 16-bit and pack 8 integers into __m128i + in[0] = _mm_packs_epi32(u0, u1); + in[1] = _mm_packs_epi32(u2, u3); + in[2] = _mm_packs_epi32(u4, u5); + in[3] = _mm_packs_epi32(u6, u7); + in[4] = _mm_packs_epi32(u8, u9); + in[5] = _mm_packs_epi32(u10, u11); + in[6] = _mm_packs_epi32(u12, u13); + in[7] = _mm_packs_epi32(u14, u15); + + // stage 2 + s0 = _mm_add_epi16(in[0], in[2]); + s1 = _mm_add_epi16(in[1], in[3]); + s2 = _mm_sub_epi16(in[0], in[2]); + s3 = _mm_sub_epi16(in[1], in[3]); + u0 = _mm_unpacklo_epi16(in[4], in[5]); + u1 = _mm_unpackhi_epi16(in[4], in[5]); + u2 = _mm_unpacklo_epi16(in[6], in[7]); + u3 = _mm_unpackhi_epi16(in[6], in[7]); + + v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); + v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); + v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); + v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); + v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); + v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); + v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); + v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); + + w0 = _mm_add_epi32(v0, v4); + w1 = _mm_add_epi32(v1, v5); + w2 = _mm_add_epi32(v2, v6); + w3 = _mm_add_epi32(v3, v7); + w4 = _mm_sub_epi32(v0, v4); + w5 = _mm_sub_epi32(v1, v5); + w6 = _mm_sub_epi32(v2, v6); + w7 = _mm_sub_epi32(v3, v7); + + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = 
_mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + // back to 16-bit intergers + s4 = _mm_packs_epi32(u0, u1); + s5 = _mm_packs_epi32(u2, u3); + s6 = _mm_packs_epi32(u4, u5); + s7 = _mm_packs_epi32(u6, u7); + + // stage 3 + u0 = _mm_unpacklo_epi16(s2, s3); + u1 = _mm_unpackhi_epi16(s2, s3); + u2 = _mm_unpacklo_epi16(s6, s7); + u3 = _mm_unpackhi_epi16(s6, s7); + + v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); + v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); + v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); + v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); + v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); + v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); + v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); + v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); + + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + s2 = _mm_packs_epi32(v0, v1); + s3 = _mm_packs_epi32(v2, v3); + s6 = _mm_packs_epi32(v4, 
v5); + s7 = _mm_packs_epi32(v6, v7); + + in[0] = s0; + in[1] = _mm_sub_epi16(k__const_0, s4); + in[2] = s6; + in[3] = _mm_sub_epi16(k__const_0, s2); + in[4] = s3; + in[5] = _mm_sub_epi16(k__const_0, s7); + in[6] = s5; + in[7] = _mm_sub_epi16(k__const_0, s1); +} + +void aom_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + // Rows. Load 4-row input data. 
+ in0 = load_input_data(input); + in1 = load_input_data(input + 8 * 1); + in2 = load_input_data(input + 8 * 2); + in3 = load_input_data(input + 8 * 3); + + // 8x4 Transpose + TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); + // Stage1 + { + const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); + const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); + + tmp0 = _mm_madd_epi16(lo_17, stg1_0); + tmp2 = _mm_madd_epi16(lo_17, stg1_1); + tmp4 = _mm_madd_epi16(lo_35, stg1_2); + tmp6 = _mm_madd_epi16(lo_35, stg1_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_4 = _mm_packs_epi32(tmp0, tmp2); + stp1_5 = _mm_packs_epi32(tmp4, tmp6); + } + + // Stage2 + { + const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); + const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); + + tmp0 = _mm_madd_epi16(lo_04, stg2_0); + tmp2 = _mm_madd_epi16(lo_04, stg2_1); + tmp4 = _mm_madd_epi16(lo_26, stg2_2); + tmp6 = _mm_madd_epi16(lo_26, stg2_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp2_0 = _mm_packs_epi32(tmp0, tmp2); + stp2_2 = _mm_packs_epi32(tmp6, tmp4); + + tmp0 = _mm_adds_epi16(stp1_4, stp1_5); + tmp1 = _mm_subs_epi16(stp1_4, stp1_5); + + stp2_4 = tmp0; + stp2_5 = _mm_unpacklo_epi64(tmp1, zero); + stp2_6 = _mm_unpackhi_epi64(tmp1, zero); + } + + // Stage3 + { + const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); + + tmp4 = _mm_adds_epi16(stp2_0, stp2_2); + tmp6 = 
_mm_subs_epi16(stp2_0, stp2_2); + + stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); + stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); + + tmp0 = _mm_madd_epi16(lo_56, stg3_0); + tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp0, tmp2); + } + + // Stage4 + tmp0 = _mm_adds_epi16(stp1_3, stp2_4); + tmp1 = _mm_adds_epi16(stp1_2, stp1_5); + tmp2 = _mm_subs_epi16(stp1_3, stp2_4); + tmp3 = _mm_subs_epi16(stp1_2, stp1_5); + + TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) + + IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4, + in5, in6, in7); + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + RECON_AND_STORE(dest + 0 * stride, in0); + RECON_AND_STORE(dest + 1 * stride, in1); + RECON_AND_STORE(dest + 2 * stride, in2); + RECON_AND_STORE(dest + 3 * stride, in3); + RECON_AND_STORE(dest + 4 * stride, in4); + RECON_AND_STORE(dest + 5 * stride, in5); + RECON_AND_STORE(dest + 6 * stride, in6); + RECON_AND_STORE(dest + 7 * stride, in7); +} +/* IDCT16: stages 2 to 6 of the 16-point inverse DCT over in[0..15]. The macro takes no arguments and relies on names from the enclosing scope: in[], the stg2_* / stg3_* / stg4_* / stg6_0 constants, rounding, tmp0..tmp3 and the stp1_* / stp2_* temporaries (including stp1_8_0 and stp1_12_0). It stops before the last butterfly; the user combines stp2_0..7 with stp1_8, stp1_9, stp2_10..13, stp1_14 and stp1_15 afterwards. */ +#define IDCT16 \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ + const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], 
in[7]); \ + const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ + const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ + const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \ + stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \ + \ + MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \ + stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ + const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ + const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ + \ + MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \ + stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \ + \ + stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + \ + stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ + const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ + const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, 
lo_4_12, hi_4_12, stg4_0, stg4_1, \ + stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ + stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, 
stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ + stp2_12) \ + } + /* IDCT16_10: reduced form of IDCT16 that reads only in[0], in[1], in[2] and in[3]; every other input is replaced by the vector named zero, which must exist in the enclosing scope next to the names IDCT16 needs. Stages 2 to 6 leave their results in the same stp1_* / stp2_* variables as IDCT16. */ +#define IDCT16_10 \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \ + stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \ + stp1_12_0) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ + \ + MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \ + \ + stp1_9 = stp1_8_0; \ + stp1_10 = stp1_11; \ + \ + stp1_13 = stp1_12_0; \ + stp1_14 = stp1_15; \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \ + stp2_5 = stp2_4; \ + stp2_6 = stp2_7; \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, 
stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ + stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_2 = stp1_1; \ + stp1_3 = stp1_0; \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, 
hi_11_12, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ + stp2_12) \ + } + /* Full 16x16 inverse DCT with reconstruction. First loop: two iterations, each loading 128 coefficients, transposing them as two 8x8 tiles, running IDCT16 and keeping the stage-7 result in l[] (first iteration) or r[] (second). Second loop: two iterations, each transposing one 8-row half of l[] and r[], running IDCT16 again, adding 32 and shifting right by 6, then writing 16 rows through RECON_AND_STORE before dest advances by 8. */ +void aom_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in[16], l[16], r[16], *curr1; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_8_0, stp1_12_0; + __m128i stp2_0, stp2_1, 
stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + curr1 = l; + for (i = 0; i < 2; i++) { + // 1-D idct + + // Load input data. + in[0] = load_input_data(input); + in[8] = load_input_data(input + 8 * 1); + in[1] = load_input_data(input + 8 * 2); + in[9] = load_input_data(input + 8 * 3); + in[2] = load_input_data(input + 8 * 4); + in[10] = load_input_data(input + 8 * 5); + in[3] = load_input_data(input + 8 * 6); + in[11] = load_input_data(input + 8 * 7); + in[4] = load_input_data(input + 8 * 8); + in[12] = load_input_data(input + 8 * 9); + in[5] = load_input_data(input + 8 * 10); + in[13] = load_input_data(input + 8 * 11); + in[6] = load_input_data(input + 8 * 12); + in[14] = load_input_data(input + 8 * 13); + in[7] = load_input_data(input + 8 * 14); + in[15] = load_input_data(input + 8 * 15); + + array_transpose_8x8(in, in); + array_transpose_8x8(in + 8, in + 8); + + IDCT16 + + // Stage7 + curr1[0] = _mm_add_epi16(stp2_0, stp1_15); + curr1[1] = _mm_add_epi16(stp2_1, stp1_14); + curr1[2] = _mm_add_epi16(stp2_2, stp2_13); + curr1[3] = _mm_add_epi16(stp2_3, stp2_12); + curr1[4] = _mm_add_epi16(stp2_4, stp2_11); + curr1[5] = _mm_add_epi16(stp2_5, stp2_10); + curr1[6] = _mm_add_epi16(stp2_6, stp1_9); + curr1[7] = _mm_add_epi16(stp2_7, stp1_8); + curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); + curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); + curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); + curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); + curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); + curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); + curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); + curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); + + curr1 = r; + input += 128; + } + for (i = 0; i < 2; i++) { + int j; + // 1-D idct + array_transpose_8x8(l + i * 8, in); + array_transpose_8x8(r + i * 8, in + 8); + + IDCT16 + + // 2-D + in[0] = _mm_add_epi16(stp2_0, stp1_15); + 
in[1] = _mm_add_epi16(stp2_1, stp1_14); + in[2] = _mm_add_epi16(stp2_2, stp2_13); + in[3] = _mm_add_epi16(stp2_3, stp2_12); + in[4] = _mm_add_epi16(stp2_4, stp2_11); + in[5] = _mm_add_epi16(stp2_5, stp2_10); + in[6] = _mm_add_epi16(stp2_6, stp1_9); + in[7] = _mm_add_epi16(stp2_7, stp1_8); + in[8] = _mm_sub_epi16(stp2_7, stp1_8); + in[9] = _mm_sub_epi16(stp2_6, stp1_9); + in[10] = _mm_sub_epi16(stp2_5, stp2_10); + in[11] = _mm_sub_epi16(stp2_4, stp2_11); + in[12] = _mm_sub_epi16(stp2_3, stp2_12); + in[13] = _mm_sub_epi16(stp2_2, stp2_13); + in[14] = _mm_sub_epi16(stp2_1, stp1_14); + in[15] = _mm_sub_epi16(stp2_0, stp1_15); + + for (j = 0; j < 16; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + /* DC-only 16x16 inverse transform: only input[0] is read. It is scaled by cospi_16_64 with dct_const_round_shift twice, then rounded with ROUND_POWER_OF_TWO by 6 bits. The function returns early when that value is 0; otherwise the value is broadcast and applied to 16 rows of dest, as two 8-wide RECON_AND_STORE calls per row. NOTE(review): zero looks unused here but is presumably referenced inside RECON_AND_STORE; confirm before removing it. */ +void aom_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a, i; + + a = (int)dct_const_round_shift(input[0] * cospi_16_64); + a = (int)dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 6); + + if (a == 0) return; + + dc_value = _mm_set1_epi16(a); + + for (i = 0; i < 16; ++i) { + RECON_AND_STORE(dest + 0, dc_value); + RECON_AND_STORE(dest + 8, dc_value); + dest += stride; + } +} + +void iadst16_8col(__m128i *in) { + // perform 16x16 1-D ADST for 8 columns + __m128i s[16], x[16], u[32], v[32]; + const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); 
+ const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); + const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); + const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kZero = _mm_set1_epi16(0); + /* stage 1: interleave in[15-2k] with in[2k] for k = 0..7, multiply-accumulate each pair against its cospi constant pair, form the sums and differences of the two halves, round-shift by DCT_CONST_BITS and pack into s[0..15]. The function works in place: after stage 4 it overwrites in[0..15] with the result. */ + u[0] = _mm_unpacklo_epi16(in[15], in[0]); + u[1] = _mm_unpackhi_epi16(in[15], in[0]); + u[2] = _mm_unpacklo_epi16(in[13], in[2]); + u[3] = _mm_unpackhi_epi16(in[13], in[2]); + u[4] = 
_mm_unpacklo_epi16(in[11], in[4]); + u[5] = _mm_unpackhi_epi16(in[11], in[4]); + u[6] = _mm_unpacklo_epi16(in[9], in[6]); + u[7] = _mm_unpackhi_epi16(in[9], in[6]); + u[8] = _mm_unpacklo_epi16(in[7], in[8]); + u[9] = _mm_unpackhi_epi16(in[7], in[8]); + u[10] = _mm_unpacklo_epi16(in[5], in[10]); + u[11] = _mm_unpackhi_epi16(in[5], in[10]); + u[12] = _mm_unpacklo_epi16(in[3], in[12]); + u[13] = _mm_unpackhi_epi16(in[3], in[12]); + u[14] = _mm_unpacklo_epi16(in[1], in[14]); + u[15] = _mm_unpackhi_epi16(in[1], in[14]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); + v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); + v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); + v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); + v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); + v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); + v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); + v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); + v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); + v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); + v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); + v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); + v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); + v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); + v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); + v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); + v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); + v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); + v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); + v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); + v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); + v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); + v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); + v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); + v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); + v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); + v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); + v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); + v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); + v[29] = 
_mm_madd_epi16(u[15], k__cospi_p29_p03); + v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); + v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); + + u[0] = _mm_add_epi32(v[0], v[16]); + u[1] = _mm_add_epi32(v[1], v[17]); + u[2] = _mm_add_epi32(v[2], v[18]); + u[3] = _mm_add_epi32(v[3], v[19]); + u[4] = _mm_add_epi32(v[4], v[20]); + u[5] = _mm_add_epi32(v[5], v[21]); + u[6] = _mm_add_epi32(v[6], v[22]); + u[7] = _mm_add_epi32(v[7], v[23]); + u[8] = _mm_add_epi32(v[8], v[24]); + u[9] = _mm_add_epi32(v[9], v[25]); + u[10] = _mm_add_epi32(v[10], v[26]); + u[11] = _mm_add_epi32(v[11], v[27]); + u[12] = _mm_add_epi32(v[12], v[28]); + u[13] = _mm_add_epi32(v[13], v[29]); + u[14] = _mm_add_epi32(v[14], v[30]); + u[15] = _mm_add_epi32(v[15], v[31]); + u[16] = _mm_sub_epi32(v[0], v[16]); + u[17] = _mm_sub_epi32(v[1], v[17]); + u[18] = _mm_sub_epi32(v[2], v[18]); + u[19] = _mm_sub_epi32(v[3], v[19]); + u[20] = _mm_sub_epi32(v[4], v[20]); + u[21] = _mm_sub_epi32(v[5], v[21]); + u[22] = _mm_sub_epi32(v[6], v[22]); + u[23] = _mm_sub_epi32(v[7], v[23]); + u[24] = _mm_sub_epi32(v[8], v[24]); + u[25] = _mm_sub_epi32(v[9], v[25]); + u[26] = _mm_sub_epi32(v[10], v[26]); + u[27] = _mm_sub_epi32(v[11], v[27]); + u[28] = _mm_sub_epi32(v[12], v[28]); + u[29] = _mm_sub_epi32(v[13], v[29]); + u[30] = _mm_sub_epi32(v[14], v[30]); + u[31] = _mm_sub_epi32(v[15], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], 
k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); + v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); + v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); + v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); + v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); + v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); + v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); + v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); + v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); + v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); + v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); + v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); + v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); + v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); + v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); + v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); + u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); + u[18] = 
_mm_srai_epi32(v[18], DCT_CONST_BITS); + u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); + u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); + u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); + u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); + u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); + u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); + u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); + u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); + u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); + u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); + u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); + u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); + u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); + + s[0] = _mm_packs_epi32(u[0], u[1]); + s[1] = _mm_packs_epi32(u[2], u[3]); + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_packs_epi32(u[8], u[9]); + s[5] = _mm_packs_epi32(u[10], u[11]); + s[6] = _mm_packs_epi32(u[12], u[13]); + s[7] = _mm_packs_epi32(u[14], u[15]); + s[8] = _mm_packs_epi32(u[16], u[17]); + s[9] = _mm_packs_epi32(u[18], u[19]); + s[10] = _mm_packs_epi32(u[20], u[21]); + s[11] = _mm_packs_epi32(u[22], u[23]); + s[12] = _mm_packs_epi32(u[24], u[25]); + s[13] = _mm_packs_epi32(u[26], u[27]); + s[14] = _mm_packs_epi32(u[28], u[29]); + s[15] = _mm_packs_epi32(u[30], u[31]); + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[9]); + u[1] = _mm_unpackhi_epi16(s[8], s[9]); + u[2] = _mm_unpacklo_epi16(s[10], s[11]); + u[3] = _mm_unpackhi_epi16(s[10], s[11]); + u[4] = _mm_unpacklo_epi16(s[12], s[13]); + u[5] = _mm_unpackhi_epi16(s[12], s[13]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + v[6] = _mm_madd_epi16(u[2], 
k__cospi_p12_m20); + v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); + v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); + v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); + v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); + v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); + v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); + v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); + v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = 
_mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + x[0] = _mm_add_epi16(s[0], s[4]); + x[1] = _mm_add_epi16(s[1], s[5]); + x[2] = _mm_add_epi16(s[2], s[6]); + x[3] = _mm_add_epi16(s[3], s[7]); + x[4] = _mm_sub_epi16(s[0], s[4]); + x[5] = _mm_sub_epi16(s[1], s[5]); + x[6] = _mm_sub_epi16(s[2], s[6]); + x[7] = _mm_sub_epi16(s[3], s[7]); + x[8] = _mm_packs_epi32(u[0], u[1]); + x[9] = _mm_packs_epi32(u[2], u[3]); + x[10] = _mm_packs_epi32(u[4], u[5]); + x[11] = _mm_packs_epi32(u[6], u[7]); + x[12] = _mm_packs_epi32(u[8], u[9]); + x[13] = _mm_packs_epi32(u[10], u[11]); + x[14] = _mm_packs_epi32(u[12], u[13]); + x[15] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + u[0] = _mm_unpacklo_epi16(x[4], x[5]); + u[1] = _mm_unpackhi_epi16(x[4], x[5]); + u[2] = _mm_unpacklo_epi16(x[6], x[7]); + u[3] = _mm_unpackhi_epi16(x[6], x[7]); + u[4] = _mm_unpacklo_epi16(x[12], x[13]); + u[5] = _mm_unpackhi_epi16(x[12], x[13]); + u[6] = _mm_unpacklo_epi16(x[14], x[15]); + u[7] = _mm_unpackhi_epi16(x[14], x[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); + v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); + v[6] = _mm_madd_epi16(u[2], 
k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); + v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], v[4]); + u[1] = _mm_add_epi32(v[1], v[5]); + u[2] = _mm_add_epi32(v[2], v[6]); + u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = _mm_sub_epi32(v[2], v[6]); + u[7] = _mm_sub_epi32(v[3], v[7]); + u[8] = _mm_add_epi32(v[8], v[12]); + u[9] = _mm_add_epi32(v[9], v[13]); + u[10] = _mm_add_epi32(v[10], v[14]); + u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); + u[15] = _mm_sub_epi32(v[11], v[15]); + + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = 
_mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[0] = _mm_add_epi16(x[0], x[2]); + s[1] = _mm_add_epi16(x[1], x[3]); + s[2] = _mm_sub_epi16(x[0], x[2]); + s[3] = _mm_sub_epi16(x[1], x[3]); + s[4] = _mm_packs_epi32(v[0], v[1]); + s[5] = _mm_packs_epi32(v[2], v[3]); + s[6] = _mm_packs_epi32(v[4], v[5]); + s[7] = _mm_packs_epi32(v[6], v[7]); + s[8] = _mm_add_epi16(x[8], x[10]); + s[9] = _mm_add_epi16(x[9], x[11]); + s[10] = _mm_sub_epi16(x[8], x[10]); + s[11] = _mm_sub_epi16(x[9], x[11]); + s[12] = _mm_packs_epi32(v[8], v[9]); + s[13] = _mm_packs_epi32(v[10], v[11]); + s[14] = _mm_packs_epi32(v[12], v[13]); + s[15] = _mm_packs_epi32(v[14], v[15]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(s[2], s[3]); + u[1] = _mm_unpackhi_epi16(s[2], s[3]); + u[2] = _mm_unpacklo_epi16(s[6], s[7]); + u[3] = _mm_unpackhi_epi16(s[6], s[7]); + u[4] = _mm_unpacklo_epi16(s[10], s[11]); + u[5] = _mm_unpackhi_epi16(s[10], s[11]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + v[6] = _mm_madd_epi16(u[2], 
k__cospi_m16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); + v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); + v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); + v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); + v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); + v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); + v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); + v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + 
v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + in[0] = s[0]; + in[1] = _mm_sub_epi16(kZero, s[8]); + in[2] = s[12]; + in[3] = _mm_sub_epi16(kZero, s[4]); + in[4] = _mm_packs_epi32(v[4], v[5]); + in[5] = _mm_packs_epi32(v[12], v[13]); + in[6] = _mm_packs_epi32(v[8], v[9]); + in[7] = _mm_packs_epi32(v[0], v[1]); + in[8] = _mm_packs_epi32(v[2], v[3]); + in[9] = _mm_packs_epi32(v[10], v[11]); + in[10] = _mm_packs_epi32(v[14], v[15]); + in[11] = _mm_packs_epi32(v[6], v[7]); + in[12] = s[5]; + in[13] = _mm_sub_epi16(kZero, s[13]); + in[14] = s[9]; + in[15] = _mm_sub_epi16(kZero, s[1]); +} + /* idct16_8col: in-place transform of the sixteen __m128i vectors in[0..15], organised as seven butterfly stages (see the stage markers below). Every vector is processed as eight 16-bit lanes. Rotations interleave two inputs with _mm_unpacklo_epi16 and _mm_unpackhi_epi16, multiply-accumulate them against a cospi constant pair with _mm_madd_epi16, add DCT_CONST_ROUNDING, shift right by DCT_CONST_BITS and narrow back to 16 bits with the saturating _mm_packs_epi32. Plain butterflies use the wrapping _mm_add_epi16 and _mm_sub_epi16. NOTE(review): judging by the name and by its use in aom_idct16_sse2 this is a 1-D 16-point inverse DCT applied to 8 columns at once -- confirm against the scalar reference. */ +void idct16_8col(__m128i *in) { + /* Constant naming: k__cospi_pA_mB holds the pair (cospi_A_64, -cospi_B_64); p marks a positive term and m a negated one. */ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i
k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i v[16], u[16], s[16], t[16]; + + // stage 1 + /* load the inputs in the permuted order 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 (4-bit bit-reversal of the index) */ s[0] = in[0]; + s[1] = in[8]; + s[2] = in[4]; + s[3] = in[12]; + s[4] = in[2]; + s[5] = in[10]; + s[6] = in[6]; + s[7] = in[14]; + s[8] = in[1]; + s[9] = in[9]; + s[10] = in[5]; + s[11] = in[13]; + s[12] = in[3]; + s[13] = in[11]; + s[14] = in[7]; + s[15] = in[15]; + + // stage 2 + /* rotate the pairs (s8, s15), (s9, s14), (s10, s13) and (s11, s12); s[0..7] are not written in this stage */ u[0] = _mm_unpacklo_epi16(s[8], s[15]); + u[1] = _mm_unpackhi_epi16(s[8], s[15]); + u[2] = _mm_unpacklo_epi16(s[9], s[14]); + u[3] = _mm_unpackhi_epi16(s[9], s[14]); + u[4] = _mm_unpacklo_epi16(s[10], s[13]); + u[5] = _mm_unpackhi_epi16(s[10], s[13]); + u[6] = _mm_unpacklo_epi16(s[11], s[12]); + u[7] = _mm_unpackhi_epi16(s[11], s[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); + v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); + v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); + v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); + v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); + v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); + v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); + v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); + v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); + v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); + v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); + v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); + v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); + v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); + v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); + v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] =
_mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[8] = _mm_packs_epi32(u[0], u[1]); + s[15] = _mm_packs_epi32(u[2], u[3]); + s[9] = _mm_packs_epi32(u[4], u[5]); + s[14] = _mm_packs_epi32(u[6], u[7]); + s[10] = _mm_packs_epi32(u[8], u[9]); + s[13] = _mm_packs_epi32(u[10], u[11]); + s[11] = _mm_packs_epi32(u[12], u[13]); + s[12] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + /* t[0..3] are copies of s[0..3]; the pairs (s4, s7) and (s5, s6) are rotated into t4, t7 and t5, t6; t[8..15] are 16-bit add and subtract butterflies of s[8..15] */ t[0] = s[0]; + t[1] = s[1]; + t[2] = s[2]; + t[3] = s[3]; + u[0] = _mm_unpacklo_epi16(s[4], s[7]); + u[1] = _mm_unpackhi_epi16(s[4], s[7]); + u[2] = _mm_unpacklo_epi16(s[5], s[6]); + u[3] = _mm_unpackhi_epi16(s[5], s[6]); + + v[0] = _mm_madd_epi16(u[0],
k__cospi_p28_m04); + v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[4] = _mm_packs_epi32(u[0], u[1]); + t[7] = _mm_packs_epi32(u[2], u[3]); + t[5] = _mm_packs_epi32(u[4], u[5]); + t[6] = _mm_packs_epi32(u[6], u[7]); + t[8] = _mm_add_epi16(s[8], s[9]); + t[9] = _mm_sub_epi16(s[8], s[9]); + t[10] = _mm_sub_epi16(s[11], s[10]); + t[11] = _mm_add_epi16(s[10], s[11]); + t[12] = _mm_add_epi16(s[12], s[13]); + t[13] = _mm_sub_epi16(s[12], s[13]); + t[14] = _mm_sub_epi16(s[15], s[14]); + t[15] = _mm_add_epi16(s[14], s[15]); + + // stage 4 + /* rotate (t0, t1), (t2, t3), (t9, t14) and (t10, t13); s[4..7] are butterflies of t[4..7]; s8, s11, s12 and s15 are copied from t */ u[0] = _mm_unpacklo_epi16(t[0], t[1]); + u[1] = _mm_unpackhi_epi16(t[0], t[1]); + u[2] = _mm_unpacklo_epi16(t[2], t[3]); + u[3] = _mm_unpackhi_epi16(t[2], t[3]); + u[4] = _mm_unpacklo_epi16(t[9], t[14]); + u[5] = _mm_unpackhi_epi16(t[9], t[14]); + u[6] = _mm_unpacklo_epi16(t[10], t[13]); + u[7] = _mm_unpackhi_epi16(t[10], t[13]); + + v[0] =
_mm_madd_epi16(u[0], k__cospi_p16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); + v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); + v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); + + /* rounding step: add k__DCT_CONST_ROUNDING to every 32-bit lane; the arithmetic shift by DCT_CONST_BITS follows */ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] =
_mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[0] = _mm_packs_epi32(u[0], u[1]); + s[1] = _mm_packs_epi32(u[2], u[3]); + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_add_epi16(t[4], t[5]); + s[5] = _mm_sub_epi16(t[4], t[5]); + s[6] = _mm_sub_epi16(t[7], t[6]); + s[7] = _mm_add_epi16(t[6], t[7]); + s[8] = t[8]; + s[15] = t[15]; + s[9] = _mm_packs_epi32(u[8], u[9]); + s[14] = _mm_packs_epi32(u[10], u[11]); + s[10] = _mm_packs_epi32(u[12], u[13]); + s[13] = _mm_packs_epi32(u[14], u[15]); + s[11] = t[11]; + s[12] = t[12]; + + // stage 5 + /* butterflies on s[0..3] and s[8..15]; t4 and t7 are copied; (s5, s6) is rotated by the cospi_16 pairs into t5 and t6 */ t[0] = _mm_add_epi16(s[0], s[3]); + t[1] = _mm_add_epi16(s[1], s[2]); + t[2] = _mm_sub_epi16(s[1], s[2]); + t[3] = _mm_sub_epi16(s[0], s[3]); + t[4] = s[4]; + t[7] = s[7]; + + u[0] = _mm_unpacklo_epi16(s[5], s[6]); + u[1] = _mm_unpackhi_epi16(s[5], s[6]); + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + t[5] = _mm_packs_epi32(u[0], u[1]); + t[6] = _mm_packs_epi32(u[2], u[3]); + + t[8] = _mm_add_epi16(s[8], s[11]); + t[9] = _mm_add_epi16(s[9], s[10]); + t[10] =
_mm_sub_epi16(s[9], s[10]); + t[11] = _mm_sub_epi16(s[8], s[11]); + t[12] = _mm_sub_epi16(s[15], s[12]); + t[13] = _mm_sub_epi16(s[14], s[13]); + t[14] = _mm_add_epi16(s[13], s[14]); + t[15] = _mm_add_epi16(s[12], s[15]); + + // stage 6 + /* butterflies on t[0..7]; s8, s9, s14 and s15 are copied; (t10, t13) and (t11, t12) are rotated by the cospi_16 pairs into s10, s13 and s11, s12 */ s[0] = _mm_add_epi16(t[0], t[7]); + s[1] = _mm_add_epi16(t[1], t[6]); + s[2] = _mm_add_epi16(t[2], t[5]); + s[3] = _mm_add_epi16(t[3], t[4]); + s[4] = _mm_sub_epi16(t[3], t[4]); + s[5] = _mm_sub_epi16(t[2], t[5]); + s[6] = _mm_sub_epi16(t[1], t[6]); + s[7] = _mm_sub_epi16(t[0], t[7]); + s[8] = t[8]; + s[9] = t[9]; + + u[0] = _mm_unpacklo_epi16(t[10], t[13]); + u[1] = _mm_unpackhi_epi16(t[10], t[13]); + u[2] = _mm_unpacklo_epi16(t[11], t[12]); + u[3] = _mm_unpackhi_epi16(t[11], t[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + s[10] = _mm_packs_epi32(u[0], u[1]); + s[13] = _mm_packs_epi32(u[2],
u[3]); + s[11] = _mm_packs_epi32(u[4], u[5]); + s[12] = _mm_packs_epi32(u[6], u[7]); + s[14] = t[14]; + s[15] = t[15]; + + // stage 7 + /* output butterfly written back over in[]: in[k] is the sum of s[k] and s[15 - k], and in[15 - k] is their difference, for k = 0..7 */ in[0] = _mm_add_epi16(s[0], s[15]); + in[1] = _mm_add_epi16(s[1], s[14]); + in[2] = _mm_add_epi16(s[2], s[13]); + in[3] = _mm_add_epi16(s[3], s[12]); + in[4] = _mm_add_epi16(s[4], s[11]); + in[5] = _mm_add_epi16(s[5], s[10]); + in[6] = _mm_add_epi16(s[6], s[9]); + in[7] = _mm_add_epi16(s[7], s[8]); + in[8] = _mm_sub_epi16(s[7], s[8]); + in[9] = _mm_sub_epi16(s[6], s[9]); + in[10] = _mm_sub_epi16(s[5], s[10]); + in[11] = _mm_sub_epi16(s[4], s[11]); + in[12] = _mm_sub_epi16(s[3], s[12]); + in[13] = _mm_sub_epi16(s[2], s[13]); + in[14] = _mm_sub_epi16(s[1], s[14]); + in[15] = _mm_sub_epi16(s[0], s[15]); +} + /* aom_idct16_sse2: calls array_transpose_16x16(in0, in1) and then idct16_8col on in0 and on in1; both arrays are updated in place. NOTE(review): array_transpose_16x16 is defined outside this excerpt; presumably in0 and in1 hold the two 8-column halves of a 16x16 block -- confirm. */ +void aom_idct16_sse2(__m128i *in0, __m128i *in1) { + array_transpose_16x16(in0, in1); + idct16_8col(in0); + idct16_8col(in1); +} + /* aom_iadst16_sse2: calls array_transpose_16x16(in0, in1) and then iadst16_8col on in0 and on in1; both arrays are updated in place. */ +void aom_iadst16_sse2(__m128i *in0, __m128i *in1) { + array_transpose_16x16(in0, in1); + iadst16_8col(in0); + iadst16_8col(in1); +} + /* aom_idct16x16_10_add_sse2: two-pass 16x16 inverse DCT that reads only four vectors of coefficients (element offsets 0, 16, 32 and 48 of input) and hands the rounded result to RECON_AND_STORE for sixteen rows of dest (row pitch = stride) in each of two loop iterations, advancing dest by 8 bytes in between. NOTE(review): the _10 suffix presumably means the caller guarantees at most 10 non-zero coefficients, all inside the top-left 4x4 -- confirm at the dispatch site. load_input_data, TRANSPOSE_8X4, array_transpose_4X8, IDCT16_10 and RECON_AND_STORE are defined outside this excerpt. */ +void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64,
-cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i in[16], l[16]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8, + stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0, + stp1_12_0; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + // First 1-D inverse DCT + // Load input data. + /* four loads, 16 elements apart (offsets 0, 16, 32 and 48 of input) */ in[0] = load_input_data(input); + in[1] = load_input_data(input + 8 * 2); + in[2] = load_input_data(input + 8 * 4); + in[3] = load_input_data(input + 8 * 6); + + TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); + + // Stage2 + { + /* the partner of each input is interleaved from the zero vector, so only one real operand feeds each madd pair */ const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); + const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); + + tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); + tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); + tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); + tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); + + stp2_8 = _mm_packs_epi32(tmp0, tmp2); + stp2_11 = _mm_packs_epi32(tmp5, tmp7); + } + + // Stage3 + { + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); + + tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); + tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + + stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); + stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); + +
stp1_4 = _mm_packs_epi32(tmp0, tmp2); + } + + // Stage4 + { + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); + + tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); + tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); + tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); + tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); + tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); + tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); + + stp1_0 = _mm_packs_epi32(tmp0, tmp0); + stp1_1 = _mm_packs_epi32(tmp2, tmp2); + stp2_9 = _mm_packs_epi32(tmp1, tmp3); + stp2_10 = _mm_packs_epi32(tmp5, tmp7); + + stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); + } + + // Stage5 and Stage6 + { + /* 16-bit butterflies; _mm_unpacklo_epi64 and _mm_unpackhi_epi64 against zero then split each result into its low and high 64-bit half */ tmp0 = _mm_add_epi16(stp2_8, stp2_11); + tmp1 = _mm_sub_epi16(stp2_8, stp2_11); + tmp2 = _mm_add_epi16(stp2_9, stp2_10); + tmp3 = _mm_sub_epi16(stp2_9, stp2_10); + + stp1_9 = _mm_unpacklo_epi64(tmp2, zero); + stp1_10 = _mm_unpacklo_epi64(tmp3, zero); + stp1_8 = _mm_unpacklo_epi64(tmp0, zero); + stp1_11 = _mm_unpacklo_epi64(tmp1, zero); + + stp1_13 = _mm_unpackhi_epi64(tmp3, zero); + stp1_14 = _mm_unpackhi_epi64(tmp2, zero); + stp1_12 = _mm_unpackhi_epi64(tmp1, zero); + stp1_15 = _mm_unpackhi_epi64(tmp0, zero); + } + + // Stage6 + { + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
+ + tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); + tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); + tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); + tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); + tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); + tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); + + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_6 = _mm_packs_epi32(tmp3, tmp1); + + stp2_10 = _mm_packs_epi32(tmp0, zero); + stp2_13 = _mm_packs_epi32(tmp2, zero); + stp2_11 = _mm_packs_epi32(tmp4, zero); + stp2_12 = _mm_packs_epi32(tmp6, zero); + + /* add and subtract butterflies; the 64-bit unpacks below split each result into separate stp2_x values */ tmp0 = _mm_add_epi16(stp1_0, stp1_4); + tmp1 = _mm_sub_epi16(stp1_0, stp1_4); + tmp2 = _mm_add_epi16(stp1_1, stp1_6); + tmp3 = _mm_sub_epi16(stp1_1, stp1_6); + + stp2_0 = _mm_unpackhi_epi64(tmp0, zero); + stp2_1 = _mm_unpacklo_epi64(tmp2, zero); + stp2_2 = _mm_unpackhi_epi64(tmp2, zero); + stp2_3 = _mm_unpacklo_epi64(tmp0, zero); + stp2_4 = _mm_unpacklo_epi64(tmp1, zero); + stp2_5 = _mm_unpackhi_epi64(tmp3, zero); + stp2_6 = _mm_unpacklo_epi64(tmp3, zero); + stp2_7 = _mm_unpackhi_epi64(tmp1, zero); + } + + // Stage7. Left 8x16 only.
+ /* l[0..15] keeps the first-pass output; the second pass below reads it through array_transpose_4X8(l + 8 * i, in) */ l[0] = _mm_add_epi16(stp2_0, stp1_15); + l[1] = _mm_add_epi16(stp2_1, stp1_14); + l[2] = _mm_add_epi16(stp2_2, stp2_13); + l[3] = _mm_add_epi16(stp2_3, stp2_12); + l[4] = _mm_add_epi16(stp2_4, stp2_11); + l[5] = _mm_add_epi16(stp2_5, stp2_10); + l[6] = _mm_add_epi16(stp2_6, stp1_9); + l[7] = _mm_add_epi16(stp2_7, stp1_8); + l[8] = _mm_sub_epi16(stp2_7, stp1_8); + l[9] = _mm_sub_epi16(stp2_6, stp1_9); + l[10] = _mm_sub_epi16(stp2_5, stp2_10); + l[11] = _mm_sub_epi16(stp2_4, stp2_11); + l[12] = _mm_sub_epi16(stp2_3, stp2_12); + l[13] = _mm_sub_epi16(stp2_2, stp2_13); + l[14] = _mm_sub_epi16(stp2_1, stp1_14); + l[15] = _mm_sub_epi16(stp2_0, stp1_15); + + // Second 1-D inverse transform, performed per 8x16 block + for (i = 0; i < 2; i++) { + int j; + array_transpose_4X8(l + 8 * i, in); + + /* IDCT16_10 is a macro defined outside this excerpt. NOTE(review): the Stage7 code below reads stp1_x and stp2_x right after it, so it presumably recomputes them from in[] -- confirm. */ IDCT16_10 + + // Stage7 + in[0] = _mm_add_epi16(stp2_0, stp1_15); + in[1] = _mm_add_epi16(stp2_1, stp1_14); + in[2] = _mm_add_epi16(stp2_2, stp2_13); + in[3] = _mm_add_epi16(stp2_3, stp2_12); + in[4] = _mm_add_epi16(stp2_4, stp2_11); + in[5] = _mm_add_epi16(stp2_5, stp2_10); + in[6] = _mm_add_epi16(stp2_6, stp1_9); + in[7] = _mm_add_epi16(stp2_7, stp1_8); + in[8] = _mm_sub_epi16(stp2_7, stp1_8); + in[9] = _mm_sub_epi16(stp2_6, stp1_9); + in[10] = _mm_sub_epi16(stp2_5, stp2_10); + in[11] = _mm_sub_epi16(stp2_4, stp2_11); + in[12] = _mm_sub_epi16(stp2_3, stp2_12); + in[13] = _mm_sub_epi16(stp2_2, stp2_13); + in[14] = _mm_sub_epi16(stp2_1, stp1_14); + in[15] = _mm_sub_epi16(stp2_0, stp1_15); + + for (j = 0; j < 16; ++j) { + // Final rounding and shift + /* saturating add of final_rounding (1 << 5), then arithmetic shift right by 6 */ in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + /* LOAD_DQCOEFF(reg, input): assigns load_input_data(input) to reg and then advances input by 8. The macro expands to a brace block and modifies input, so input must be a modifiable pointer variable. */ +#define LOAD_DQCOEFF(reg, input) \ + { \ + reg = load_input_data(input); \ + input += 8; \ + } + +#define IDCT32_34 \ + /* Stage1 */ \ + { \ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ + \ + const __m128i
// Seven-stage 32-point inverse DCT butterfly for the reduced case in which
// only in[0]..in[7] are read; every other input term is replaced by `zero`
// when the operand pairs are interleaved.
//
// The macro takes no arguments and expands to a sequence of brace-enclosed
// statement groups, so it is invoked without a trailing semicolon. It expects
// the following names in the enclosing scope:
//   in[]                  - input vectors (only indices 0..7 are read)
//   zero, rounding        - constant vectors
//   stg1_*..stg4_*, stg6_0 - per-stage constant pairs
//   stp1_0..31, stp2_0..31 - working registers, overwritten
//   tmp0..tmp3            - scratch used directly in Stage5; the
//                           MULTIPLICATION_AND_ADD / MULTIPLICATION_AND_ADD_2
//                           helpers (defined elsewhere in this file) may need
//                           more scratch names - check their definitions.
//
// On completion the results are in stp1_0..stp1_31; the caller forms the
// final outputs as stp1_k +/- stp1_(31-k).
#define IDCT32_34 \
  /* Stage1 */ \
  { \
    const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
    const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
    \
    const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
    const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
    \
    const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
    const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
    \
    const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
    const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
    \
    MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \
                             stp1_31); \
    MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \
                             stp1_28); \
    MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \
                             stp1_27); \
    MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \
                             stp1_24); \
  } \
  \
  /* Stage2 */ \
  { \
    const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
    const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
    \
    const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
    const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
    \
    MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \
                             stp2_15); \
    MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \
                             stp2_12); \
    \
    /* The Stage1 outputs pass through unchanged. */ \
    stp2_16 = stp1_16; \
    stp2_19 = stp1_19; \
    \
    stp2_20 = stp1_20; \
    stp2_23 = stp1_23; \
    \
    stp2_24 = stp1_24; \
    stp2_27 = stp1_27; \
    \
    stp2_28 = stp1_28; \
    stp2_31 = stp1_31; \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
    const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
    \
    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
    \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
    /* Both halves read stp1_24; Stage2 only copied it into stp2_24, so */ \
    /* the value is the same either way. */ \
    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); \
    \
    MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \
                             stp1_7); \
    \
    stp1_8 = stp2_8; \
    stp1_11 = stp2_11; \
    stp1_12 = stp2_12; \
    stp1_15 = stp2_15; \
    \
    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
                           stp1_29) \
    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
                           stp1_25) \
    \
    stp1_16 = stp2_16; \
    stp1_31 = stp2_31; \
    stp1_19 = stp2_19; \
    stp1_20 = stp2_20; \
    stp1_23 = stp2_23; \
    stp1_24 = stp2_24; \
    stp1_27 = stp2_27; \
    stp1_28 = stp2_28; \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
    const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
    \
    MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \
                             stp2_1); \
    \
    stp2_4 = stp1_4; \
    stp2_5 = stp1_4; \
    stp2_6 = stp1_7; \
    stp2_7 = stp1_7; \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
                           stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
                           stp2_13) \
    \
    stp2_8 = stp1_8; \
    stp2_15 = stp1_15; \
    stp2_11 = stp1_11; \
    stp2_12 = stp1_12; \
    \
    stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
    stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
    stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
    stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
    \
    stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
    stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
    stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
    stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
    \
    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
    \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
    \
    stp1_0 = stp2_0; \
    stp1_1 = stp2_1; \
    stp1_2 = stp2_1; \
    stp1_3 = stp2_0; \
    \
    /* stp1_5 / stp1_6: multiply-add against stg4_1 / stg4_0, round, */ \
    /* shift right by DCT_CONST_BITS and pack back to 16 bits. */ \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_4 = stp2_4; \
    stp1_7 = stp2_7; \
    \
    stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
    \
    stp1_16 = stp2_16; \
    stp1_17 = stp2_17; \
    \
    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
                           stp1_28) \
    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
                           stp1_26) \
    \
    stp1_22 = stp2_22; \
    stp1_23 = stp2_23; \
    stp1_24 = stp2_24; \
    stp1_25 = stp2_25; \
    stp1_30 = stp2_30; \
    stp1_31 = stp2_31; \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
    \
    stp2_8 = stp1_8; \
    stp2_9 = stp1_9; \
    stp2_14 = stp1_14; \
    stp2_15 = stp1_15; \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
                           stp2_12) \
    \
    stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
    stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
    stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
    stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
    \
    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
    stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
    stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
    stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
    stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
  } \
  \
  /* Stage7 */ \
  { \
    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
    \
    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
    stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
    stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
    stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
    stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
    stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
    stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
    \
    stp1_16 = stp2_16; \
    stp1_17 = stp2_17; \
    stp1_18 = stp2_18; \
    stp1_19 = stp2_19; \
    \
    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
                           stp1_26) \
    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
                           stp1_24) \
    \
    stp1_28 = stp2_28; \
    stp1_29 = stp2_29; \
    stp1_30 = stp2_30; \
    stp1_31 = stp2_31; \
  }
// Full seven-stage 32-point inverse DCT butterfly.
//
// in0, in1: array/pointer expressions; indices 0..15 of each are read. The
//           local names (e.g. lo_1_31 built from (in0)[1] and (in1)[15])
//           suggest in0 holds coefficients 0..15 and in1 holds 16..31 -
//           NOTE(review): confirm against the caller's layout.
//
// The macro expands to a sequence of brace-enclosed statement groups and is
// invoked without a trailing semicolon. It expects these names in the
// enclosing scope: rounding, the stg1_*..stg4_* and stg6_0 constant pairs,
// the working registers stp1_0..31 and stp2_0..31 (all overwritten), and
// scratch tmp0..tmp3 (used directly in Stage5). MULTIPLICATION_AND_ADD is
// defined elsewhere in this file and may need further scratch names.
//
// On completion the results are in stp1_0..stp1_31.
#define IDCT32(in0, in1) \
  /* Stage1 */ \
  { \
    const __m128i lo_1_31 = _mm_unpacklo_epi16((in0)[1], (in1)[15]); \
    const __m128i hi_1_31 = _mm_unpackhi_epi16((in0)[1], (in1)[15]); \
    const __m128i lo_17_15 = _mm_unpacklo_epi16((in1)[1], (in0)[15]); \
    const __m128i hi_17_15 = _mm_unpackhi_epi16((in1)[1], (in0)[15]); \
    \
    const __m128i lo_9_23 = _mm_unpacklo_epi16((in0)[9], (in1)[7]); \
    const __m128i hi_9_23 = _mm_unpackhi_epi16((in0)[9], (in1)[7]); \
    const __m128i lo_25_7 = _mm_unpacklo_epi16((in1)[9], (in0)[7]); \
    const __m128i hi_25_7 = _mm_unpackhi_epi16((in1)[9], (in0)[7]); \
    \
    const __m128i lo_5_27 = _mm_unpacklo_epi16((in0)[5], (in1)[11]); \
    const __m128i hi_5_27 = _mm_unpackhi_epi16((in0)[5], (in1)[11]); \
    const __m128i lo_21_11 = _mm_unpacklo_epi16((in1)[5], (in0)[11]); \
    const __m128i hi_21_11 = _mm_unpackhi_epi16((in1)[5], (in0)[11]); \
    \
    const __m128i lo_13_19 = _mm_unpacklo_epi16((in0)[13], (in1)[3]); \
    const __m128i hi_13_19 = _mm_unpackhi_epi16((in0)[13], (in1)[3]); \
    const __m128i lo_29_3 = _mm_unpacklo_epi16((in1)[13], (in0)[3]); \
    const __m128i hi_29_3 = _mm_unpackhi_epi16((in1)[13], (in0)[3]); \
    \
    MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
                           stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \
                           stp1_30) \
    MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
                           stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
    MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
                           stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
                           stp1_21, stp1_26) \
    MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
                           stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
                           stp1_23, stp1_24) \
  } \
  \
  /* Stage2 */ \
  { \
    const __m128i lo_2_30 = _mm_unpacklo_epi16((in0)[2], (in1)[14]); \
    const __m128i hi_2_30 = _mm_unpackhi_epi16((in0)[2], (in1)[14]); \
    const __m128i lo_18_14 = _mm_unpacklo_epi16((in1)[2], (in0)[14]); \
    const __m128i hi_18_14 = _mm_unpackhi_epi16((in1)[2], (in0)[14]); \
    \
    const __m128i lo_10_22 = _mm_unpacklo_epi16((in0)[10], (in1)[6]); \
    const __m128i hi_10_22 = _mm_unpackhi_epi16((in0)[10], (in1)[6]); \
    const __m128i lo_26_6 = _mm_unpacklo_epi16((in1)[10], (in0)[6]); \
    const __m128i hi_26_6 = _mm_unpackhi_epi16((in1)[10], (in0)[6]); \
    \
    MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
                           stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
                           stp2_14) \
    MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
                           stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \
                           stp2_12) \
    \
    stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
    stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
    stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
    stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
    \
    stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
    stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
    stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
    stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
    \
    stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
    stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
    stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
    stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
    \
    stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
    stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
    stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
    stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_4_28 = _mm_unpacklo_epi16((in0)[4], (in1)[12]); \
    const __m128i hi_4_28 = _mm_unpackhi_epi16((in0)[4], (in1)[12]); \
    const __m128i lo_20_12 = _mm_unpacklo_epi16((in1)[4], (in0)[12]); \
    const __m128i hi_20_12 = _mm_unpackhi_epi16((in1)[4], (in0)[12]); \
    \
    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
    \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
    \
    MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
                           stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
                           stp1_6) \
    \
    stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
    \
    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
                           stp1_29) \
    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
                           stp1_25) \
    \
    /* These registers pass through this stage unchanged. */ \
    stp1_16 = stp2_16; \
    stp1_31 = stp2_31; \
    stp1_19 = stp2_19; \
    stp1_20 = stp2_20; \
    stp1_23 = stp2_23; \
    stp1_24 = stp2_24; \
    stp1_27 = stp2_27; \
    stp1_28 = stp2_28; \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_16 = _mm_unpacklo_epi16((in0)[0], (in1)[0]); \
    const __m128i hi_0_16 = _mm_unpackhi_epi16((in0)[0], (in1)[0]); \
    const __m128i lo_8_24 = _mm_unpacklo_epi16((in0)[8], (in1)[8]); \
    const __m128i hi_8_24 = _mm_unpackhi_epi16((in0)[8], (in1)[8]); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
                           stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
                           stp2_13) \
    \
    stp2_8 = stp1_8; \
    stp2_15 = stp1_15; \
    stp2_11 = stp1_11; \
    stp2_12 = stp1_12; \
    \
    stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
    stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
    stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
    stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
    \
    stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
    stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
    stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
    stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
    \
    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
    \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    /* stp1_5 / stp1_6: multiply-add against stg4_1 / stg4_0, round, */ \
    /* shift right by DCT_CONST_BITS and pack back to 16 bits. */ \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_4 = stp2_4; \
    stp1_7 = stp2_7; \
    \
    stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
    \
    stp1_16 = stp2_16; \
    stp1_17 = stp2_17; \
    \
    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
                           stp1_28) \
    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
                           stp1_26) \
    \
    stp1_22 = stp2_22; \
    stp1_23 = stp2_23; \
    stp1_24 = stp2_24; \
    stp1_25 = stp2_25; \
    stp1_30 = stp2_30; \
    stp1_31 = stp2_31; \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
    \
    stp2_8 = stp1_8; \
    stp2_9 = stp1_9; \
    stp2_14 = stp1_14; \
    stp2_15 = stp1_15; \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
                           stp2_12) \
    \
    stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
    stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
    stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
    stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
    \
    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
    stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
    stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
    stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
    stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
  } \
  \
  /* Stage7 */ \
  { \
    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
    \
    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
    stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
    stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
    stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
    stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
    stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
    stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
    \
    stp1_16 = stp2_16; \
    stp1_17 = stp2_17; \
    stp1_18 = stp2_18; \
    stp1_19 = stp2_19; \
    \
    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
                           stp1_26) \
    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
                           stp1_24) \
    \
    stp1_28 = stp2_28; \
    stp1_29 = stp2_29; \
    stp1_30 = stp2_30; \
    stp1_31 = stp2_31; \
  }
_mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ + stp1_26) \ + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ + stp1_24) \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ + } + +// Only upper-left 8x8 has non-zero coeff +void aom_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + + // idct constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 
= pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in[32], col[32]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, + stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, + stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // Load input data. Only need to load the top left 8x8 block. 
+ in[0] = load_input_data(input); + in[1] = load_input_data(input + 32); + in[2] = load_input_data(input + 64); + in[3] = load_input_data(input + 96); + in[4] = load_input_data(input + 128); + in[5] = load_input_data(input + 160); + in[6] = load_input_data(input + 192); + in[7] = load_input_data(input + 224); + + for (i = 8; i < 32; ++i) { + in[i] = _mm_setzero_si128(); + } + + array_transpose_8x8(in, in); + // TODO(hkuang): Following transposes are unnecessary. But remove them will + // lead to performance drop on some devices. + array_transpose_8x8(in + 8, in + 8); + array_transpose_8x8(in + 16, in + 16); + array_transpose_8x8(in + 24, in + 24); + + IDCT32_34 + + // 1_D: Store 32 intermediate results for each 8x32 block. + col[0] = _mm_add_epi16(stp1_0, stp1_31); + col[1] = _mm_add_epi16(stp1_1, stp1_30); + col[2] = _mm_add_epi16(stp1_2, stp1_29); + col[3] = _mm_add_epi16(stp1_3, stp1_28); + col[4] = _mm_add_epi16(stp1_4, stp1_27); + col[5] = _mm_add_epi16(stp1_5, stp1_26); + col[6] = _mm_add_epi16(stp1_6, stp1_25); + col[7] = _mm_add_epi16(stp1_7, stp1_24); + col[8] = _mm_add_epi16(stp1_8, stp1_23); + col[9] = _mm_add_epi16(stp1_9, stp1_22); + col[10] = _mm_add_epi16(stp1_10, stp1_21); + col[11] = _mm_add_epi16(stp1_11, stp1_20); + col[12] = _mm_add_epi16(stp1_12, stp1_19); + col[13] = _mm_add_epi16(stp1_13, stp1_18); + col[14] = _mm_add_epi16(stp1_14, stp1_17); + col[15] = _mm_add_epi16(stp1_15, stp1_16); + col[16] = _mm_sub_epi16(stp1_15, stp1_16); + col[17] = _mm_sub_epi16(stp1_14, stp1_17); + col[18] = _mm_sub_epi16(stp1_13, stp1_18); + col[19] = _mm_sub_epi16(stp1_12, stp1_19); + col[20] = _mm_sub_epi16(stp1_11, stp1_20); + col[21] = _mm_sub_epi16(stp1_10, stp1_21); + col[22] = _mm_sub_epi16(stp1_9, stp1_22); + col[23] = _mm_sub_epi16(stp1_8, stp1_23); + col[24] = _mm_sub_epi16(stp1_7, stp1_24); + col[25] = _mm_sub_epi16(stp1_6, stp1_25); + col[26] = _mm_sub_epi16(stp1_5, stp1_26); + col[27] = _mm_sub_epi16(stp1_4, stp1_27); + col[28] = 
_mm_sub_epi16(stp1_3, stp1_28); + col[29] = _mm_sub_epi16(stp1_2, stp1_29); + col[30] = _mm_sub_epi16(stp1_1, stp1_30); + col[31] = _mm_sub_epi16(stp1_0, stp1_31); + for (i = 0; i < 4; i++) { + int j; + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col + i * 8, in); + IDCT32_34 + + // 2_D: Calculate the results and store them to destination. + in[0] = _mm_add_epi16(stp1_0, stp1_31); + in[1] = _mm_add_epi16(stp1_1, stp1_30); + in[2] = _mm_add_epi16(stp1_2, stp1_29); + in[3] = _mm_add_epi16(stp1_3, stp1_28); + in[4] = _mm_add_epi16(stp1_4, stp1_27); + in[5] = _mm_add_epi16(stp1_5, stp1_26); + in[6] = _mm_add_epi16(stp1_6, stp1_25); + in[7] = _mm_add_epi16(stp1_7, stp1_24); + in[8] = _mm_add_epi16(stp1_8, stp1_23); + in[9] = _mm_add_epi16(stp1_9, stp1_22); + in[10] = _mm_add_epi16(stp1_10, stp1_21); + in[11] = _mm_add_epi16(stp1_11, stp1_20); + in[12] = _mm_add_epi16(stp1_12, stp1_19); + in[13] = _mm_add_epi16(stp1_13, stp1_18); + in[14] = _mm_add_epi16(stp1_14, stp1_17); + in[15] = _mm_add_epi16(stp1_15, stp1_16); + in[16] = _mm_sub_epi16(stp1_15, stp1_16); + in[17] = _mm_sub_epi16(stp1_14, stp1_17); + in[18] = _mm_sub_epi16(stp1_13, stp1_18); + in[19] = _mm_sub_epi16(stp1_12, stp1_19); + in[20] = _mm_sub_epi16(stp1_11, stp1_20); + in[21] = _mm_sub_epi16(stp1_10, stp1_21); + in[22] = _mm_sub_epi16(stp1_9, stp1_22); + in[23] = _mm_sub_epi16(stp1_8, stp1_23); + in[24] = _mm_sub_epi16(stp1_7, stp1_24); + in[25] = _mm_sub_epi16(stp1_6, stp1_25); + in[26] = _mm_sub_epi16(stp1_5, stp1_26); + in[27] = _mm_sub_epi16(stp1_4, stp1_27); + in[28] = _mm_sub_epi16(stp1_3, stp1_28); + in[29] = _mm_sub_epi16(stp1_2, stp1_29); + in[30] = _mm_sub_epi16(stp1_1, stp1_30); + in[31] = _mm_sub_epi16(stp1_0, stp1_31); + + for (j = 0; j < 32; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +void aom_idct32x32_1024_add_sse2(const 
tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + + // idct constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in[32], col[128], zero_idx[16]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, + stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, + stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i, j, i32; + + for (i = 0; i < 4; i++) { + i32 = (i << 5); + // First 1-D idct + // Load input data. 
+ LOAD_DQCOEFF(in[0], input); + LOAD_DQCOEFF(in[8], input); + LOAD_DQCOEFF(in[16], input); + LOAD_DQCOEFF(in[24], input); + LOAD_DQCOEFF(in[1], input); + LOAD_DQCOEFF(in[9], input); + LOAD_DQCOEFF(in[17], input); + LOAD_DQCOEFF(in[25], input); + LOAD_DQCOEFF(in[2], input); + LOAD_DQCOEFF(in[10], input); + LOAD_DQCOEFF(in[18], input); + LOAD_DQCOEFF(in[26], input); + LOAD_DQCOEFF(in[3], input); + LOAD_DQCOEFF(in[11], input); + LOAD_DQCOEFF(in[19], input); + LOAD_DQCOEFF(in[27], input); + + LOAD_DQCOEFF(in[4], input); + LOAD_DQCOEFF(in[12], input); + LOAD_DQCOEFF(in[20], input); + LOAD_DQCOEFF(in[28], input); + LOAD_DQCOEFF(in[5], input); + LOAD_DQCOEFF(in[13], input); + LOAD_DQCOEFF(in[21], input); + LOAD_DQCOEFF(in[29], input); + LOAD_DQCOEFF(in[6], input); + LOAD_DQCOEFF(in[14], input); + LOAD_DQCOEFF(in[22], input); + LOAD_DQCOEFF(in[30], input); + LOAD_DQCOEFF(in[7], input); + LOAD_DQCOEFF(in[15], input); + LOAD_DQCOEFF(in[23], input); + LOAD_DQCOEFF(in[31], input); + + // checking if all entries are zero + zero_idx[0] = _mm_or_si128(in[0], in[1]); + zero_idx[1] = _mm_or_si128(in[2], in[3]); + zero_idx[2] = _mm_or_si128(in[4], in[5]); + zero_idx[3] = _mm_or_si128(in[6], in[7]); + zero_idx[4] = _mm_or_si128(in[8], in[9]); + zero_idx[5] = _mm_or_si128(in[10], in[11]); + zero_idx[6] = _mm_or_si128(in[12], in[13]); + zero_idx[7] = _mm_or_si128(in[14], in[15]); + zero_idx[8] = _mm_or_si128(in[16], in[17]); + zero_idx[9] = _mm_or_si128(in[18], in[19]); + zero_idx[10] = _mm_or_si128(in[20], in[21]); + zero_idx[11] = _mm_or_si128(in[22], in[23]); + zero_idx[12] = _mm_or_si128(in[24], in[25]); + zero_idx[13] = _mm_or_si128(in[26], in[27]); + zero_idx[14] = _mm_or_si128(in[28], in[29]); + zero_idx[15] = _mm_or_si128(in[30], in[31]); + + zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); + zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); + zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); + zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); + zero_idx[4] = 
_mm_or_si128(zero_idx[8], zero_idx[9]); + zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); + zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); + zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); + + zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); + zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); + zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); + zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); + zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); + zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); + zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); + + if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { + col[i32 + 0] = _mm_setzero_si128(); + col[i32 + 1] = _mm_setzero_si128(); + col[i32 + 2] = _mm_setzero_si128(); + col[i32 + 3] = _mm_setzero_si128(); + col[i32 + 4] = _mm_setzero_si128(); + col[i32 + 5] = _mm_setzero_si128(); + col[i32 + 6] = _mm_setzero_si128(); + col[i32 + 7] = _mm_setzero_si128(); + col[i32 + 8] = _mm_setzero_si128(); + col[i32 + 9] = _mm_setzero_si128(); + col[i32 + 10] = _mm_setzero_si128(); + col[i32 + 11] = _mm_setzero_si128(); + col[i32 + 12] = _mm_setzero_si128(); + col[i32 + 13] = _mm_setzero_si128(); + col[i32 + 14] = _mm_setzero_si128(); + col[i32 + 15] = _mm_setzero_si128(); + col[i32 + 16] = _mm_setzero_si128(); + col[i32 + 17] = _mm_setzero_si128(); + col[i32 + 18] = _mm_setzero_si128(); + col[i32 + 19] = _mm_setzero_si128(); + col[i32 + 20] = _mm_setzero_si128(); + col[i32 + 21] = _mm_setzero_si128(); + col[i32 + 22] = _mm_setzero_si128(); + col[i32 + 23] = _mm_setzero_si128(); + col[i32 + 24] = _mm_setzero_si128(); + col[i32 + 25] = _mm_setzero_si128(); + col[i32 + 26] = _mm_setzero_si128(); + col[i32 + 27] = _mm_setzero_si128(); + col[i32 + 28] = _mm_setzero_si128(); + col[i32 + 29] = _mm_setzero_si128(); + col[i32 + 30] = _mm_setzero_si128(); + col[i32 + 31] = _mm_setzero_si128(); + continue; + } + + // Transpose 32x8 block to 8x32 block + 
array_transpose_8x8(in, in); + array_transpose_8x8(in + 8, in + 8); + array_transpose_8x8(in + 16, in + 16); + array_transpose_8x8(in + 24, in + 24); + + IDCT32(in, in + 16) + + // 1_D: Store 32 intermediate results for each 8x32 block. + col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); + col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); + col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); + col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); + col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); + col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); + col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); + col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); + col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); + col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); + col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); + col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); + col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); + col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); + col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); + col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); + col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); + col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); + col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); + col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); + col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); + col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); + col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); + col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); + col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); + col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); + col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); + col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); + col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); + col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); + col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); + col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); + } + for (i = 0; i < 4; i++) { + // Second 1-D idct + j = i << 3; + + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col + j, in); + 
array_transpose_8x8(col + j + 32, in + 8); + array_transpose_8x8(col + j + 64, in + 16); + array_transpose_8x8(col + j + 96, in + 24); + + IDCT32(in, in + 16) + + // 2_D: Calculate the results and store them to destination. + in[0] = _mm_add_epi16(stp1_0, stp1_31); + in[1] = _mm_add_epi16(stp1_1, stp1_30); + in[2] = _mm_add_epi16(stp1_2, stp1_29); + in[3] = _mm_add_epi16(stp1_3, stp1_28); + in[4] = _mm_add_epi16(stp1_4, stp1_27); + in[5] = _mm_add_epi16(stp1_5, stp1_26); + in[6] = _mm_add_epi16(stp1_6, stp1_25); + in[7] = _mm_add_epi16(stp1_7, stp1_24); + in[8] = _mm_add_epi16(stp1_8, stp1_23); + in[9] = _mm_add_epi16(stp1_9, stp1_22); + in[10] = _mm_add_epi16(stp1_10, stp1_21); + in[11] = _mm_add_epi16(stp1_11, stp1_20); + in[12] = _mm_add_epi16(stp1_12, stp1_19); + in[13] = _mm_add_epi16(stp1_13, stp1_18); + in[14] = _mm_add_epi16(stp1_14, stp1_17); + in[15] = _mm_add_epi16(stp1_15, stp1_16); + in[16] = _mm_sub_epi16(stp1_15, stp1_16); + in[17] = _mm_sub_epi16(stp1_14, stp1_17); + in[18] = _mm_sub_epi16(stp1_13, stp1_18); + in[19] = _mm_sub_epi16(stp1_12, stp1_19); + in[20] = _mm_sub_epi16(stp1_11, stp1_20); + in[21] = _mm_sub_epi16(stp1_10, stp1_21); + in[22] = _mm_sub_epi16(stp1_9, stp1_22); + in[23] = _mm_sub_epi16(stp1_8, stp1_23); + in[24] = _mm_sub_epi16(stp1_7, stp1_24); + in[25] = _mm_sub_epi16(stp1_6, stp1_25); + in[26] = _mm_sub_epi16(stp1_5, stp1_26); + in[27] = _mm_sub_epi16(stp1_4, stp1_27); + in[28] = _mm_sub_epi16(stp1_3, stp1_28); + in[29] = _mm_sub_epi16(stp1_2, stp1_29); + in[30] = _mm_sub_epi16(stp1_1, stp1_30); + in[31] = _mm_sub_epi16(stp1_0, stp1_31); + + for (j = 0; j < 32; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +void aom_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a, j; + + a = 
(int)dct_const_round_shift(input[0] * cospi_16_64); + a = (int)dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 6); + + if (a == 0) return; + + dc_value = _mm_set1_epi16(a); + + for (j = 0; j < 32; ++j) { + RECON_AND_STORE(dest + 0 + j * stride, dc_value); + RECON_AND_STORE(dest + 8 + j * stride, dc_value); + RECON_AND_STORE(dest + 16 + j * stride, dc_value); + RECON_AND_STORE(dest + 24 + j * stride, dc_value); + } +} + +// Apply a 32-element IDCT to 8 columns. This does not do any transposition +// of its input - the caller is expected to have done that. +// The input buffers are the top and bottom halves of an 8x32 block. +void idct32_8col(__m128i *in0, __m128i *in1) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + + // idct constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, 
cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, + stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, 
stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, + stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + IDCT32(in0, in1) + + // Final butterfly: the 32 outputs are written back in place, outputs 0-15 + // into in0[0..15] and outputs 16-31 into in1[0..15]. Nothing is stored to a + // destination frame here. + in0[0] = _mm_add_epi16(stp1_0, stp1_31); + in0[1] = _mm_add_epi16(stp1_1, stp1_30); + in0[2] = _mm_add_epi16(stp1_2, stp1_29); + in0[3] = _mm_add_epi16(stp1_3, stp1_28); + in0[4] = _mm_add_epi16(stp1_4, stp1_27); + in0[5] = _mm_add_epi16(stp1_5, stp1_26); + in0[6] = _mm_add_epi16(stp1_6, stp1_25); + in0[7] = _mm_add_epi16(stp1_7, stp1_24); + in0[8] = _mm_add_epi16(stp1_8, stp1_23); + in0[9] = _mm_add_epi16(stp1_9, stp1_22); + in0[10] = _mm_add_epi16(stp1_10, stp1_21); + in0[11] = _mm_add_epi16(stp1_11, stp1_20); + in0[12] = _mm_add_epi16(stp1_12, stp1_19); + in0[13] = _mm_add_epi16(stp1_13, stp1_18); + in0[14] = _mm_add_epi16(stp1_14, stp1_17); + in0[15] = _mm_add_epi16(stp1_15, stp1_16); + in1[0] = _mm_sub_epi16(stp1_15, stp1_16); + in1[1] = _mm_sub_epi16(stp1_14, stp1_17); + in1[2] = _mm_sub_epi16(stp1_13, stp1_18); + in1[3] = _mm_sub_epi16(stp1_12, stp1_19); + in1[4] = _mm_sub_epi16(stp1_11, stp1_20); + in1[5] = _mm_sub_epi16(stp1_10, stp1_21); + in1[6] = _mm_sub_epi16(stp1_9, stp1_22); + in1[7] = _mm_sub_epi16(stp1_8, stp1_23); + in1[8] = _mm_sub_epi16(stp1_7, stp1_24); + in1[9] = _mm_sub_epi16(stp1_6, stp1_25); + in1[10] = _mm_sub_epi16(stp1_5, stp1_26); + in1[11] = _mm_sub_epi16(stp1_4, stp1_27); + in1[12] = _mm_sub_epi16(stp1_3, stp1_28); + in1[13] = _mm_sub_epi16(stp1_2, stp1_29); + in1[14] = _mm_sub_epi16(stp1_1, stp1_30); + in1[15] = _mm_sub_epi16(stp1_0, stp1_31); +} + +#if CONFIG_HIGHBITDEPTH +// Clamps each signed 16-bit lane of value to the range [0, (1 << bd) - 1]. +static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { + __m128i ubounded, retval; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi16(1); + const __m128i max =
_mm_subs_epi16(_mm_slli_epi16(one, bd), one); + ubounded = _mm_cmpgt_epi16(value, max); + retval = _mm_andnot_si128(ubounded, value); + ubounded = _mm_and_si128(ubounded, max); + retval = _mm_or_si128(retval, ubounded); + retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); + return retval; +} + +void aom_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + int i, j; + __m128i inptr[4]; + __m128i sign_bits[2]; + __m128i temp_mm, min_input, max_input; + int test; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + int optimised_cols = 0; + const __m128i zero = _mm_set1_epi16(0); + const __m128i eight = _mm_set1_epi16(8); + const __m128i max = _mm_set1_epi16(12043); + const __m128i min = _mm_set1_epi16(-12043); + // Load input into __m128i + inptr[0] = _mm_loadu_si128((const __m128i *)input); + inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); + inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); + inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); + + // Pack to 16 bits + inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]); + inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); + + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp_mm = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp_mm); + + if (!test) { + // Do the row transform + aom_idct4_sse2(inptr); + + // Check the min & max values + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp_mm = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp_mm); + + if (test) { + array_transpose_4x4(inptr); + sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); + sign_bits[1] = 
_mm_cmplt_epi16(inptr[1], zero); + inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); + inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); + inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); + inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); + _mm_storeu_si128((__m128i *)outptr, inptr[0]); + _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); + _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); + _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 4; ++i) { + aom_highbd_idct4_c(input, outptr, bd); + input += 4; + outptr += 4; + } + } + + if (optimised_cols) { + aom_idct4_sse2(inptr); + + // Final round and shift + inptr[0] = _mm_add_epi16(inptr[0], eight); + inptr[1] = _mm_add_epi16(inptr[1], eight); + + inptr[0] = _mm_srai_epi16(inptr[0], 4); + inptr[1] = _mm_srai_epi16(inptr[1], 4); + + // Reconstruction and Store + { + __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); + __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi64( + d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); + d2 = _mm_unpacklo_epi64( + d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); + d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); + d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); + // store input0 + _mm_storel_epi64((__m128i *)dest, d0); + // store input1 + d0 = _mm_srli_si128(d0, 8); + _mm_storel_epi64((__m128i *)(dest + stride), d0); + // store input2 + _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); + // store input3 + d2 = _mm_srli_si128(d2, 8); + _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[4], temp_out[4]; + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; + aom_highbd_idct4_c(temp_in, 
temp_out, bd); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); + } + } + } +} + +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h new file mode 100644 index 000000000..95d246c3c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_INV_TXFM_SSE2_H_ +#define AOM_DSP_X86_INV_TXFM_SSE2_H_ + +#include <emmintrin.h> // SSE2 +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/inv_txfm.h" +#include "aom_dsp/x86/txfm_common_sse2.h" + +// perform 4x4 transpose +static INLINE void array_transpose_4x4(__m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); + + res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); + res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); +} + +static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i
tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); + + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + + res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); + res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); + res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); + res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); + res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); +} + +#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ + const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ + const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ + const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ + const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = 
_mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ + out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ + out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ + out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ + } + +// Transposes the 4x4 block held in the low four 16-bit lanes of in0..in3: +// out0 receives transposed rows 0-1 and out1 transposed rows 2-3. Both +// intermediates are computed before either output is written, so out0/out1 +// may alias in0/in1. The results go to out0/out1 as the parameter list +// promises; the inputs are no longer silently overwritten. +#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + \ + out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ + out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ + } + +static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + + out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); + out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); + out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); + out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); +} + +static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { + __m128i tbuf[8]; + array_transpose_8x8(res0, res0); + array_transpose_8x8(res1, tbuf); + array_transpose_8x8(res0 + 8, res1); + array_transpose_8x8(res1 + 8, res1 + 8); + + res0[8] = tbuf[0]; + res0[9] = tbuf[1]; + res0[10] = tbuf[2]; + res0[11] = tbuf[3]; + res0[12] = tbuf[4]; + res0[13] = tbuf[5]; + res0[14] = tbuf[6]; + res0[15] = tbuf[7]; +} + +// Function to allow 8 bit optimisations to be used when profile 0 is used with +// highbitdepth enabled +static INLINE __m128i load_input_data(const tran_low_t *data) { +#if CONFIG_HIGHBITDEPTH + return octa_set_epi16(data[0], data[1],
data[2], data[3], data[4], data[5], + data[6], data[7]); +#else + return _mm_load_si128((const __m128i *)data); +#endif +} + +static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { + in[0] = load_input_data(input + 0 * 16); + in[1] = load_input_data(input + 1 * 16); + in[2] = load_input_data(input + 2 * 16); + in[3] = load_input_data(input + 3 * 16); + in[4] = load_input_data(input + 4 * 16); + in[5] = load_input_data(input + 5 * 16); + in[6] = load_input_data(input + 6 * 16); + in[7] = load_input_data(input + 7 * 16); + + in[8] = load_input_data(input + 8 * 16); + in[9] = load_input_data(input + 9 * 16); + in[10] = load_input_data(input + 10 * 16); + in[11] = load_input_data(input + 11 * 16); + in[12] = load_input_data(input + 12 * 16); + in[13] = load_input_data(input + 13 * 16); + in[14] = load_input_data(input + 14 * 16); + in[15] = load_input_data(input + 15 * 16); +} + +#define RECON_AND_STORE(dest, in_x) \ + { \ + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + _mm_storel_epi64((__m128i *)(dest), d0); \ + } + +static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = 
_mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + RECON_AND_STORE(dest + 0 * stride, in[0]); + RECON_AND_STORE(dest + 1 * stride, in[1]); + RECON_AND_STORE(dest + 2 * stride, in[2]); + RECON_AND_STORE(dest + 3 * stride, in[3]); + RECON_AND_STORE(dest + 4 * stride, in[4]); + RECON_AND_STORE(dest + 5 * stride, in[5]); + RECON_AND_STORE(dest + 6 * stride, in[6]); + RECON_AND_STORE(dest + 7 * stride, in[7]); + RECON_AND_STORE(dest + 8 * stride, in[8]); + RECON_AND_STORE(dest + 9 * stride, in[9]); + RECON_AND_STORE(dest + 10 * stride, in[10]); + RECON_AND_STORE(dest + 11 * stride, in[11]); + RECON_AND_STORE(dest + 12 * stride, in[12]); + RECON_AND_STORE(dest + 13 * stride, in[13]); + RECON_AND_STORE(dest + 14 * stride, in[14]); + RECON_AND_STORE(dest + 15 * stride, in[15]); +} + +#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \ + { \ + const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ + const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_4 = 
_mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + } + +#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + } + +void iadst16_8col(__m128i *in); +void idct16_8col(__m128i *in); +void aom_idct4_sse2(__m128i *in); +void aom_idct8_sse2(__m128i *in); +void aom_idct16_sse2(__m128i *in0, __m128i *in1); +void aom_iadst4_sse2(__m128i *in); +void aom_iadst8_sse2(__m128i *in); +void aom_iadst16_sse2(__m128i *in0, __m128i *in1); +void idct32_8col(__m128i *in0, __m128i *in1); + +#endif // AOM_DSP_X86_INV_TXFM_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c b/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c new file mode 100644 index 000000000..9d006797b --- /dev/null +++ b/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c @@ -0,0 +1,1333 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <tmmintrin.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/x86/inv_txfm_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" + +void aom_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // Load input data.
+ in0 = load_input_data(input); + in1 = load_input_data(input + 8 * 1); + in2 = load_input_data(input + 8 * 2); + in3 = load_input_data(input + 8 * 3); + in4 = load_input_data(input + 8 * 4); + in5 = load_input_data(input + 8 * 5); + in6 = load_input_data(input + 8 * 6); + in7 = load_input_data(input + 8 * 7); + + // 2-D + for (i = 0; i < 2; i++) { + // 8x8 Transpose is copied from vpx_fdct8x8_sse2() + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + // 4-stage 1D idct8x8 + { + /* Stage1 */ + { + const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); + const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); + const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); + const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); + + { + tmp0 = _mm_madd_epi16(lo_17, stg1_0); + tmp1 = _mm_madd_epi16(hi_17, stg1_0); + tmp2 = _mm_madd_epi16(lo_17, stg1_1); + tmp3 = _mm_madd_epi16(hi_17, stg1_1); + tmp4 = _mm_madd_epi16(lo_35, stg1_2); + tmp5 = _mm_madd_epi16(hi_35, stg1_2); + tmp6 = _mm_madd_epi16(lo_35, stg1_3); + tmp7 = _mm_madd_epi16(hi_35, stg1_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, 14); + tmp1 = _mm_srai_epi32(tmp1, 14); + tmp2 = _mm_srai_epi32(tmp2, 14); + tmp3 = _mm_srai_epi32(tmp3, 14); + tmp4 = _mm_srai_epi32(tmp4, 14); + tmp5 = _mm_srai_epi32(tmp5, 14); + tmp6 = _mm_srai_epi32(tmp6, 14); + tmp7 = _mm_srai_epi32(tmp7, 14); + + stp1_4 = _mm_packs_epi32(tmp0, tmp1); + stp1_7 = _mm_packs_epi32(tmp2, tmp3); + stp1_5 = _mm_packs_epi32(tmp4, tmp5); + stp1_6 = _mm_packs_epi32(tmp6, tmp7); + } + } + + /* Stage2 */ + { + const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); + const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); + 
+ { + tmp0 = _mm_unpacklo_epi16(in0, in4); + tmp1 = _mm_unpackhi_epi16(in0, in4); + + tmp2 = _mm_madd_epi16(tmp0, stk2_0); + tmp3 = _mm_madd_epi16(tmp1, stk2_0); + tmp4 = _mm_madd_epi16(tmp0, stk2_1); + tmp5 = _mm_madd_epi16(tmp1, stk2_1); + + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + + stp2_0 = _mm_packs_epi32(tmp2, tmp3); + stp2_1 = _mm_packs_epi32(tmp4, tmp5); + + tmp0 = _mm_madd_epi16(lo_26, stg2_2); + tmp1 = _mm_madd_epi16(hi_26, stg2_2); + tmp2 = _mm_madd_epi16(lo_26, stg2_3); + tmp3 = _mm_madd_epi16(hi_26, stg2_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + + tmp0 = _mm_srai_epi32(tmp0, 14); + tmp1 = _mm_srai_epi32(tmp1, 14); + tmp2 = _mm_srai_epi32(tmp2, 14); + tmp3 = _mm_srai_epi32(tmp3, 14); + + stp2_2 = _mm_packs_epi32(tmp0, tmp1); + stp2_3 = _mm_packs_epi32(tmp2, tmp3); + } + + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + } + + /* Stage3 */ + { + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); + + tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5); + tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5); + + tmp2 = _mm_madd_epi16(tmp0, stk2_1); + tmp3 = _mm_madd_epi16(tmp1, stk2_1); + tmp4 = _mm_madd_epi16(tmp0, stk2_0); + tmp5 = _mm_madd_epi16(tmp1, stk2_0); + + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp5 = _mm_add_epi32(tmp5, 
rounding); + + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp2, tmp3); + stp1_6 = _mm_packs_epi32(tmp4, tmp5); + } + + /* Stage4 */ + in0 = _mm_add_epi16(stp1_0, stp2_7); + in1 = _mm_add_epi16(stp1_1, stp1_6); + in2 = _mm_add_epi16(stp1_2, stp1_5); + in3 = _mm_add_epi16(stp1_3, stp2_4); + in4 = _mm_sub_epi16(stp1_3, stp2_4); + in5 = _mm_sub_epi16(stp1_2, stp1_5); + in6 = _mm_sub_epi16(stp1_1, stp1_6); + in7 = _mm_sub_epi16(stp1_0, stp2_7); + } + } + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + RECON_AND_STORE(dest + 0 * stride, in0); + RECON_AND_STORE(dest + 1 * stride, in1); + RECON_AND_STORE(dest + 2 * stride, in2); + RECON_AND_STORE(dest + 3 * stride, in3); + RECON_AND_STORE(dest + 4 * stride, in4); + RECON_AND_STORE(dest + 5 * stride, in5); + RECON_AND_STORE(dest + 6 * stride, in6); + RECON_AND_STORE(dest + 7 * stride, in7); +} + +void aom_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); + const __m128i stg1_1 = pair_set_epi16(2 * 
cospi_4_64, 2 * cospi_4_64); + const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); + const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); + const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); + const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); + const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); + const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3; + + // Rows. Load 4-row input data. + in0 = load_input_data(input); + in1 = load_input_data(input + 8 * 1); + in2 = load_input_data(input + 8 * 2); + in3 = load_input_data(input + 8 * 3); + + // 8x4 Transpose + TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); + + // Stage1 + tmp0 = _mm_mulhrs_epi16(in0, stg1_0); + tmp1 = _mm_mulhrs_epi16(in0, stg1_1); + tmp2 = _mm_mulhrs_epi16(in1, stg1_2); + tmp3 = _mm_mulhrs_epi16(in1, stg1_3); + + stp1_4 = _mm_unpackhi_epi64(tmp0, tmp1); + stp1_5 = _mm_unpackhi_epi64(tmp2, tmp3); + + // Stage2 + tmp0 = _mm_mulhrs_epi16(in0, stg2_0); + stp2_0 = _mm_unpacklo_epi64(tmp0, tmp0); + + tmp1 = _mm_mulhrs_epi16(in1, stg2_2); + tmp2 = _mm_mulhrs_epi16(in1, stg2_3); + stp2_2 = _mm_unpacklo_epi64(tmp2, tmp1); + + tmp0 = _mm_add_epi16(stp1_4, stp1_5); + tmp1 = _mm_sub_epi16(stp1_4, stp1_5); + + stp2_4 = tmp0; + stp2_5 = _mm_unpacklo_epi64(tmp1, zero); + stp2_6 = _mm_unpackhi_epi64(tmp1, zero); + + tmp0 = _mm_unpacklo_epi16(stp2_5, stp2_6); + tmp1 = _mm_madd_epi16(tmp0, stg3_0); + tmp2 = _mm_madd_epi16(tmp0, stk2_0); // stg3_1 = stk2_0 + + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + 
tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp1, tmp2); + + // Stage3 + tmp2 = _mm_add_epi16(stp2_0, stp2_2); + tmp3 = _mm_sub_epi16(stp2_0, stp2_2); + + stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2); + stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2); + + // Stage4 + tmp0 = _mm_add_epi16(stp1_3, stp2_4); + tmp1 = _mm_add_epi16(stp1_2, stp1_5); + tmp2 = _mm_sub_epi16(stp1_3, stp2_4); + tmp3 = _mm_sub_epi16(stp1_2, stp1_5); + + TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) + + /* Stage1 */ + stp1_4 = _mm_mulhrs_epi16(in1, stg1_0); + stp1_7 = _mm_mulhrs_epi16(in1, stg1_1); + stp1_5 = _mm_mulhrs_epi16(in3, stg1_2); + stp1_6 = _mm_mulhrs_epi16(in3, stg1_3); + + /* Stage2 */ + stp2_0 = _mm_mulhrs_epi16(in0, stg2_0); + stp2_1 = _mm_mulhrs_epi16(in0, stg2_0); + + stp2_2 = _mm_mulhrs_epi16(in2, stg2_2); + stp2_3 = _mm_mulhrs_epi16(in2, stg2_3); + + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + + /* Stage3 */ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); + + tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5); + tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5); + + tmp2 = _mm_madd_epi16(tmp0, stk2_0); + tmp3 = _mm_madd_epi16(tmp1, stk2_0); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + stp1_6 = _mm_packs_epi32(tmp2, tmp3); + + tmp2 = _mm_madd_epi16(tmp0, stk2_1); + tmp3 = _mm_madd_epi16(tmp1, stk2_1); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + stp1_5 = _mm_packs_epi32(tmp2, tmp3); + + /* Stage4 */ + in0 = 
_mm_add_epi16(stp1_0, stp2_7); + in1 = _mm_add_epi16(stp1_1, stp1_6); + in2 = _mm_add_epi16(stp1_2, stp1_5); + in3 = _mm_add_epi16(stp1_3, stp2_4); + in4 = _mm_sub_epi16(stp1_3, stp2_4); + in5 = _mm_sub_epi16(stp1_2, stp1_5); + in6 = _mm_sub_epi16(stp1_1, stp1_6); + in7 = _mm_sub_epi16(stp1_0, stp2_7); + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + RECON_AND_STORE(dest + 0 * stride, in0); + RECON_AND_STORE(dest + 1 * stride, in1); + RECON_AND_STORE(dest + 2 * stride, in2); + RECON_AND_STORE(dest + 3 * stride, in3); + RECON_AND_STORE(dest + 4 * stride, in4); + RECON_AND_STORE(dest + 5 * stride, in5); + RECON_AND_STORE(dest + 6 * stride, in6); + RECON_AND_STORE(dest + 7 * stride, in7); +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out, + int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm_add_epi16(in[i], in[bound - i]); + out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]); + i++; + } +} + +#define BUTTERFLY_PAIR(x0, x1, co0, co1) \ + do { \ + tmp0 = _mm_madd_epi16(x0, co0); \ + tmp1 = _mm_madd_epi16(x1, co0); \ + tmp2 = _mm_madd_epi16(x0, co1); \ + tmp3 = _mm_madd_epi16(x1, co1); \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = 
_mm_add_epi32(tmp3, rounding); \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + } while (0) + +static INLINE void butterfly(const __m128i *x0, const __m128i *x1, + const __m128i *c0, const __m128i *c1, __m128i *y0, + __m128i *y1) { + __m128i tmp0, tmp1, tmp2, tmp3, u0, u1; + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + + u0 = _mm_unpacklo_epi16(*x0, *x1); + u1 = _mm_unpackhi_epi16(*x0, *x1); + BUTTERFLY_PAIR(u0, u1, *c0, *c1); + *y0 = _mm_packs_epi32(tmp0, tmp1); + *y1 = _mm_packs_epi32(tmp2, tmp3); +} + +static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0, + const __m128i *c1) { + __m128i tmp0, tmp1, tmp2, tmp3, u0, u1; + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + + u0 = _mm_unpacklo_epi16(*x0, *x1); + u1 = _mm_unpackhi_epi16(*x0, *x1); + BUTTERFLY_PAIR(u0, u1, *c0, *c1); + *x0 = _mm_packs_epi32(tmp0, tmp1); + *x1 = _mm_packs_epi32(tmp2, tmp3); +} + +static void idct32_34_first_half(const __m128i *in, __m128i *stp1) { + const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); + const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); + const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); + const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); + + const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); + const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, 
-cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i x0, x1, x4, x5, x6, x7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; + + // phase 1 + + // 0, 15 + u2 = _mm_mulhrs_epi16(in[2], stk2_1); // stp2_15 + u3 = _mm_mulhrs_epi16(in[6], stk2_7); // stp2_12 + v15 = _mm_add_epi16(u2, u3); + // in[0], in[4] + x0 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[0] + x7 = _mm_mulhrs_epi16(in[4], stk3_1); // stp1[7] + v0 = _mm_add_epi16(x0, x7); // stp2_0 + stp1[0] = _mm_add_epi16(v0, v15); + stp1[15] = _mm_sub_epi16(v0, v15); + + // in[2], in[6] + u0 = _mm_mulhrs_epi16(in[2], stk2_0); // stp2_8 + u1 = _mm_mulhrs_epi16(in[6], stk2_6); // stp2_11 + butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5); // stp2_9, stp2_14 + butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7); // stp2_10, stp2_13 + + v8 = _mm_add_epi16(u0, u1); + v9 = _mm_add_epi16(u4, u6); + v10 = _mm_sub_epi16(u4, u6); + v11 = _mm_sub_epi16(u0, u1); + v12 = _mm_sub_epi16(u2, u3); + v13 = _mm_sub_epi16(u5, u7); + v14 = _mm_add_epi16(u5, u7); + + butterfly_self(&v10, &v13, &stg6_0, &stg4_0); + butterfly_self(&v11, &v12, &stg6_0, &stg4_0); + + // 1, 14 + x1 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[1], stk4_1 = stk4_0 + // stp1[2] = stp1[0], stp1[3] = stp1[1] + x4 = _mm_mulhrs_epi16(in[4], stk3_0); // stp1[4] + butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6); + v1 = _mm_add_epi16(x1, x6); // stp2_1 + v2 = _mm_add_epi16(x0, x5); // stp2_2 + stp1[1] = _mm_add_epi16(v1, v14); + stp1[14] = _mm_sub_epi16(v1, v14); + + stp1[2] = _mm_add_epi16(v2, v13); + stp1[13] = _mm_sub_epi16(v2, v13); + + v3 = _mm_add_epi16(x1, x4); // stp2_3 + v4 = _mm_sub_epi16(x1, x4); // stp2_4 + + v5 = _mm_sub_epi16(x0, x5); // stp2_5 + + v6 = _mm_sub_epi16(x1, x6); // stp2_6 + v7 = _mm_sub_epi16(x0, x7); // stp2_7 + stp1[3] = _mm_add_epi16(v3, v12); + stp1[12] = _mm_sub_epi16(v3, v12); + + stp1[6] = _mm_add_epi16(v6, v9); + stp1[9] = 
_mm_sub_epi16(v6, v9); + + stp1[7] = _mm_add_epi16(v7, v8); + stp1[8] = _mm_sub_epi16(v7, v8); + + stp1[4] = _mm_add_epi16(v4, v11); + stp1[11] = _mm_sub_epi16(v4, v11); + + stp1[5] = _mm_add_epi16(v5, v10); + stp1[10] = _mm_sub_epi16(v5, v10); +} + +static void idct32_34_second_half(const __m128i *in, __m128i *stp1) { + const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); + const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); + const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); + const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); + const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); + const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); + const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); + const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i v16, v17, v18, v19, v20, v21, v22, v23; + __m128i v24, v25, v26, v27, v28, v29, v30, v31; + __m128i u16, u17, u18, u19, u20, u21, u22, u23; + __m128i u24, u25, u26, u27, u28, u29, u30, u31; + + v16 = _mm_mulhrs_epi16(in[1], stk1_0); + v31 = _mm_mulhrs_epi16(in[1], stk1_1); + + v19 = _mm_mulhrs_epi16(in[7], stk1_6); + v28 = 
_mm_mulhrs_epi16(in[7], stk1_7); + + v20 = _mm_mulhrs_epi16(in[5], stk1_8); + v27 = _mm_mulhrs_epi16(in[5], stk1_9); + + v23 = _mm_mulhrs_epi16(in[3], stk1_14); + v24 = _mm_mulhrs_epi16(in[3], stk1_15); + + butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, &v30); + butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29); + butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26); + butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25); + + u16 = _mm_add_epi16(v16, v19); + u17 = _mm_add_epi16(v17, v18); + u18 = _mm_sub_epi16(v17, v18); + u19 = _mm_sub_epi16(v16, v19); + u20 = _mm_sub_epi16(v23, v20); + u21 = _mm_sub_epi16(v22, v21); + u22 = _mm_add_epi16(v22, v21); + u23 = _mm_add_epi16(v23, v20); + u24 = _mm_add_epi16(v24, v27); + u27 = _mm_sub_epi16(v24, v27); + u25 = _mm_add_epi16(v25, v26); + u26 = _mm_sub_epi16(v25, v26); + u28 = _mm_sub_epi16(v31, v28); + u31 = _mm_add_epi16(v28, v31); + u29 = _mm_sub_epi16(v30, v29); + u30 = _mm_add_epi16(v29, v30); + + butterfly_self(&u18, &u29, &stg4_4, &stg4_5); + butterfly_self(&u19, &u28, &stg4_4, &stg4_5); + butterfly_self(&u20, &u27, &stg4_6, &stg4_4); + butterfly_self(&u21, &u26, &stg4_6, &stg4_4); + + stp1[16] = _mm_add_epi16(u16, u23); + stp1[23] = _mm_sub_epi16(u16, u23); + + stp1[17] = _mm_add_epi16(u17, u22); + stp1[22] = _mm_sub_epi16(u17, u22); + + stp1[18] = _mm_add_epi16(u18, u21); + stp1[21] = _mm_sub_epi16(u18, u21); + + stp1[19] = _mm_add_epi16(u19, u20); + stp1[20] = _mm_sub_epi16(u19, u20); + + stp1[24] = _mm_sub_epi16(u31, u24); + stp1[31] = _mm_add_epi16(u24, u31); + + stp1[25] = _mm_sub_epi16(u30, u25); + stp1[30] = _mm_add_epi16(u25, u30); + + stp1[26] = _mm_sub_epi16(u29, u26); + stp1[29] = _mm_add_epi16(u26, u29); + + stp1[27] = _mm_sub_epi16(u28, u27); + stp1[28] = _mm_add_epi16(u27, u28); + + butterfly_self(&stp1[20], &stp1[27], &stg6_0, &stg4_0); + butterfly_self(&stp1[21], &stp1[26], &stg6_0, &stg4_0); + butterfly_self(&stp1[22], &stp1[25], &stg6_0, &stg4_0); + butterfly_self(&stp1[23], &stp1[24], &stg6_0, 
&stg4_0); +} + +// Only upper-left 8x8 has non-zero coeff +void aom_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + __m128i in[32], col[32]; + __m128i stp1[32]; + int i; + + // Load input data. Only need to load the top left 8x8 block. + in[0] = load_input_data(input); + in[1] = load_input_data(input + 32); + in[2] = load_input_data(input + 64); + in[3] = load_input_data(input + 96); + in[4] = load_input_data(input + 128); + in[5] = load_input_data(input + 160); + in[6] = load_input_data(input + 192); + in[7] = load_input_data(input + 224); + + array_transpose_8x8(in, in); + idct32_34_first_half(in, stp1); + idct32_34_second_half(in, stp1); + + // 1_D: Store 32 intermediate results for each 8x32 block. + add_sub_butterfly(stp1, col, 32); + for (i = 0; i < 4; i++) { + int j; + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col + i * 8, in); + idct32_34_first_half(in, stp1); + idct32_34_second_half(in, stp1); + + // 2_D: Calculate the results and store them to destination. 
+ add_sub_butterfly(stp1, in, 32); + for (j = 0; j < 32; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +// in0[16] represents the left 8x16 block +// in1[16] represents the right 8x16 block +static void load_buffer_16x16(const tran_low_t *input, __m128i *in0, + __m128i *in1) { + int i; + for (i = 0; i < 16; i++) { + in0[i] = load_input_data(input); + in1[i] = load_input_data(input + 8); + input += 32; + } +} + +static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0, + __m128i *out1) { + array_transpose_8x8(in0, out0); + array_transpose_8x8(&in0[8], out1); + array_transpose_8x8(in1, &out0[8]); + array_transpose_8x8(&in1[8], &out1[8]); +} + +// Group the coefficient calculation into smaller functions +// to prevent stack spillover: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 +static void idct32_8x32_135_quarter_1(const __m128i *in /*in[16]*/, + __m128i *out /*out[8]*/) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + + { + const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); + const __m128i stk4_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); + const __m128i stk4_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); + u0 = _mm_mulhrs_epi16(in[0], stk4_0); + u2 = _mm_mulhrs_epi16(in[8], stk4_2); + u3 = _mm_mulhrs_epi16(in[8], stk4_3); + u1 = u0; + } + + v0 = _mm_add_epi16(u0, u3); + v1 = _mm_add_epi16(u1, u2); + v2 = _mm_sub_epi16(u1, u2); + v3 = _mm_sub_epi16(u0, u3); + + { + const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); + const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); + const __m128i stk3_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); + const __m128i stk3_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); + u4 = _mm_mulhrs_epi16(in[4], stk3_0); + u7 
= _mm_mulhrs_epi16(in[4], stk3_1); + u5 = _mm_mulhrs_epi16(in[12], stk3_2); + u6 = _mm_mulhrs_epi16(in[12], stk3_3); + } + + v4 = _mm_add_epi16(u4, u5); + v5 = _mm_sub_epi16(u4, u5); + v6 = _mm_sub_epi16(u7, u6); + v7 = _mm_add_epi16(u7, u6); + + { + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); + } + + out[0] = _mm_add_epi16(v0, v7); + out[1] = _mm_add_epi16(v1, v6); + out[2] = _mm_add_epi16(v2, v5); + out[3] = _mm_add_epi16(v3, v4); + out[4] = _mm_sub_epi16(v3, v4); + out[5] = _mm_sub_epi16(v2, v5); + out[6] = _mm_sub_epi16(v1, v6); + out[7] = _mm_sub_epi16(v0, v7); +} + +static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/, + __m128i *out /*out[8]*/) { + __m128i u8, u9, u10, u11, u12, u13, u14, u15; + __m128i v8, v9, v10, v11, v12, v13, v14, v15; + + { + const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); + const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); + const __m128i stk2_2 = pair_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64); + const __m128i stk2_3 = pair_set_epi16(2 * cospi_14_64, 2 * cospi_14_64); + const __m128i stk2_4 = pair_set_epi16(2 * cospi_22_64, 2 * cospi_22_64); + const __m128i stk2_5 = pair_set_epi16(2 * cospi_10_64, 2 * cospi_10_64); + const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); + const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); + u8 = _mm_mulhrs_epi16(in[2], stk2_0); + u15 = _mm_mulhrs_epi16(in[2], stk2_1); + u9 = _mm_mulhrs_epi16(in[14], stk2_2); + u14 = _mm_mulhrs_epi16(in[14], stk2_3); + u10 = _mm_mulhrs_epi16(in[10], stk2_4); + u13 = _mm_mulhrs_epi16(in[10], stk2_5); + u11 = _mm_mulhrs_epi16(in[6], stk2_6); + u12 = _mm_mulhrs_epi16(in[6], stk2_7); + } + + v8 = _mm_add_epi16(u8, u9); + v9 = _mm_sub_epi16(u8, u9); + v10 = _mm_sub_epi16(u11, u10); + v11 = _mm_add_epi16(u11, u10); + v12 = 
_mm_add_epi16(u12, u13); + v13 = _mm_sub_epi16(u12, u13); + v14 = _mm_sub_epi16(u15, u14); + v15 = _mm_add_epi16(u15, u14); + + { + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + butterfly_self(&v9, &v14, &stg4_4, &stg4_5); + butterfly_self(&v10, &v13, &stg4_6, &stg4_4); + } + + out[0] = _mm_add_epi16(v8, v11); + out[1] = _mm_add_epi16(v9, v10); + out[2] = _mm_sub_epi16(v9, v10); + out[3] = _mm_sub_epi16(v8, v11); + out[4] = _mm_sub_epi16(v15, v12); + out[5] = _mm_sub_epi16(v14, v13); + out[6] = _mm_add_epi16(v14, v13); + out[7] = _mm_add_epi16(v15, v12); + + { + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); + butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); + } +} + +// 8x32 block even indexed 8 inputs of in[16], +// output first half 16 to out[32] +static void idct32_8x32_quarter_1_2(const __m128i *in /*in[16]*/, + __m128i *out /*out[32]*/) { + __m128i temp[16]; + idct32_8x32_135_quarter_1(in, temp); + idct32_8x32_135_quarter_2(in, &temp[8]); + add_sub_butterfly(temp, out, 16); +} + +// 8x32 block odd indexed 8 inputs of in[16], +// output second half 16 to out[32] +static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/, + __m128i *out /*out[32]*/) { + __m128i v16, v17, v18, v19, v20, v21, v22, v23; + __m128i v24, v25, v26, v27, v28, v29, v30, v31; + __m128i u16, u17, u18, u19, u20, u21, u22, u23; + __m128i u24, u25, u26, u27, u28, u29, u30, u31; + + { + const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); + const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); + const __m128i stk1_2 = pair_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64); + const __m128i stk1_3 = pair_set_epi16(2 * cospi_15_64, 2 * cospi_15_64); + + const 
__m128i stk1_4 = pair_set_epi16(2 * cospi_23_64, 2 * cospi_23_64); + const __m128i stk1_5 = pair_set_epi16(2 * cospi_9_64, 2 * cospi_9_64); + const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); + const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); + const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); + const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); + const __m128i stk1_10 = pair_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64); + const __m128i stk1_11 = pair_set_epi16(2 * cospi_11_64, 2 * cospi_11_64); + + const __m128i stk1_12 = pair_set_epi16(2 * cospi_19_64, 2 * cospi_19_64); + const __m128i stk1_13 = pair_set_epi16(2 * cospi_13_64, 2 * cospi_13_64); + const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); + const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); + u16 = _mm_mulhrs_epi16(in[1], stk1_0); + u31 = _mm_mulhrs_epi16(in[1], stk1_1); + u17 = _mm_mulhrs_epi16(in[15], stk1_2); + u30 = _mm_mulhrs_epi16(in[15], stk1_3); + + u18 = _mm_mulhrs_epi16(in[9], stk1_4); + u29 = _mm_mulhrs_epi16(in[9], stk1_5); + u19 = _mm_mulhrs_epi16(in[7], stk1_6); + u28 = _mm_mulhrs_epi16(in[7], stk1_7); + + u20 = _mm_mulhrs_epi16(in[5], stk1_8); + u27 = _mm_mulhrs_epi16(in[5], stk1_9); + u21 = _mm_mulhrs_epi16(in[11], stk1_10); + u26 = _mm_mulhrs_epi16(in[11], stk1_11); + + u22 = _mm_mulhrs_epi16(in[13], stk1_12); + u25 = _mm_mulhrs_epi16(in[13], stk1_13); + u23 = _mm_mulhrs_epi16(in[3], stk1_14); + u24 = _mm_mulhrs_epi16(in[3], stk1_15); + } + + v16 = _mm_add_epi16(u16, u17); + v17 = _mm_sub_epi16(u16, u17); + v18 = _mm_sub_epi16(u19, u18); + v19 = _mm_add_epi16(u19, u18); + + v20 = _mm_add_epi16(u20, u21); + v21 = _mm_sub_epi16(u20, u21); + v22 = _mm_sub_epi16(u23, u22); + v23 = _mm_add_epi16(u23, u22); + + v24 = _mm_add_epi16(u24, u25); + v25 = _mm_sub_epi16(u24, u25); + v26 = _mm_sub_epi16(u27, u26); + v27 = _mm_add_epi16(u27, u26); + + v28 = 
_mm_add_epi16(u28, u29); + v29 = _mm_sub_epi16(u28, u29); + v30 = _mm_sub_epi16(u31, u30); + v31 = _mm_add_epi16(u31, u30); + + { + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + butterfly_self(&v17, &v30, &stg3_4, &stg3_5); + butterfly_self(&v18, &v29, &stg3_6, &stg3_4); + butterfly_self(&v21, &v26, &stg3_8, &stg3_9); + butterfly_self(&v22, &v25, &stg3_10, &stg3_8); + } + + u16 = _mm_add_epi16(v16, v19); + u17 = _mm_add_epi16(v17, v18); + u18 = _mm_sub_epi16(v17, v18); + u19 = _mm_sub_epi16(v16, v19); + u20 = _mm_sub_epi16(v23, v20); + u21 = _mm_sub_epi16(v22, v21); + u22 = _mm_add_epi16(v22, v21); + u23 = _mm_add_epi16(v23, v20); + + u24 = _mm_add_epi16(v24, v27); + u25 = _mm_add_epi16(v25, v26); + u26 = _mm_sub_epi16(v25, v26); + u27 = _mm_sub_epi16(v24, v27); + u28 = _mm_sub_epi16(v31, v28); + u29 = _mm_sub_epi16(v30, v29); + u30 = _mm_add_epi16(v29, v30); + u31 = _mm_add_epi16(v28, v31); + + { + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + butterfly_self(&u18, &u29, &stg4_4, &stg4_5); + butterfly_self(&u19, &u28, &stg4_4, &stg4_5); + butterfly_self(&u20, &u27, &stg4_6, &stg4_4); + butterfly_self(&u21, &u26, &stg4_6, &stg4_4); + } + + out[0] = _mm_add_epi16(u16, u23); + out[1] = _mm_add_epi16(u17, u22); + out[2] = _mm_add_epi16(u18, u21); + out[3] = _mm_add_epi16(u19, u20); + v20 = _mm_sub_epi16(u19, u20); + v21 = _mm_sub_epi16(u18, u21); + v22 = _mm_sub_epi16(u17, u22); + v23 = _mm_sub_epi16(u16, u23); + + v24 = _mm_sub_epi16(u31, u24); + v25 = 
_mm_sub_epi16(u30, u25); + v26 = _mm_sub_epi16(u29, u26); + v27 = _mm_sub_epi16(u28, u27); + out[12] = _mm_add_epi16(u27, u28); + out[13] = _mm_add_epi16(u26, u29); + out[14] = _mm_add_epi16(u25, u30); + out[15] = _mm_add_epi16(u24, u31); + + { + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]); + butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]); + butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]); + butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]); + } +} + +// 8x16 block, input __m128i in[16], output __m128i in[32] +static void idct32_8x32_135(__m128i *in /*in[32]*/) { + __m128i out[32]; + idct32_8x32_quarter_1_2(in, out); + idct32_8x32_quarter_3_4(in, &out[16]); + add_sub_butterfly(out, in, 32); +} + +static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + int j = 0; + while (j < 32) { + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding); + + in[j] = _mm_srai_epi16(in[j], 6); + in[j + 1] = _mm_srai_epi16(in[j + 1], 6); + + RECON_AND_STORE(dst, in[j]); + dst += stride; + RECON_AND_STORE(dst, in[j + 1]); + dst += stride; + j += 2; + } +} + +static INLINE void recon_and_store(__m128i *in0, __m128i *in1, uint8_t *dest, + int stride) { + store_buffer_8x32(in0, dest, stride); + store_buffer_8x32(in1, dest + 8, stride); +} + +static INLINE void idct32_135(__m128i *col0, __m128i *col1) { + idct32_8x32_135(col0); + idct32_8x32_135(col1); +} + +typedef enum { left_16, right_16 } ColsIndicator; + +static void transpose_and_copy_16x16(__m128i *in0, __m128i *in1, __m128i *store, + ColsIndicator cols) { + switch (cols) { + case left_16: { + int i; + array_transpose_16x16(in0, in1); + for (i = 0; i < 16; ++i) { + store[i] = in0[16 + 
i]; + store[16 + i] = in1[16 + i]; + } + break; + } + case right_16: { + array_transpose_16x16_2(store, &store[16], in0, in1); + break; + } + default: { assert(0); } + } +} + +// Only upper-left 16x16 has non-zero coeff +void aom_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + // Each array represents an 8x32 block + __m128i col0[32], col1[32]; + // This array represents a 16x16 block + __m128i temp[32]; + + // Load input data. Only need to load the top left 16x16 block. + load_buffer_16x16(input, col0, col1); + + // columns + array_transpose_16x16(col0, col1); + idct32_135(col0, col1); + + // rows + transpose_and_copy_16x16(col0, col1, temp, left_16); + idct32_135(col0, col1); + recon_and_store(col0, col1, dest, stride); + + transpose_and_copy_16x16(col0, col1, temp, right_16); + idct32_135(col0, col1); + recon_and_store(col0, col1, dest + 16, stride); +} + +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i in[32] +static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/, + __m128i *out /*out[16]*/) { + __m128i u8, u9, u10, u11, u12, u13, u14, u15; // stp2_ + __m128i v8, v9, v10, v11, v12, v13, v14, v15; // stp1_ + + { + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15); + butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14); + } + + v8 = _mm_add_epi16(u8, u9); + v9 = _mm_sub_epi16(u8, u9); + v14 = _mm_sub_epi16(u15, u14); + v15 = _mm_add_epi16(u15, u14); + + { + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const 
__m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13); + butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12); + } + + v10 = _mm_sub_epi16(u11, u10); + v11 = _mm_add_epi16(u11, u10); + v12 = _mm_add_epi16(u12, u13); + v13 = _mm_sub_epi16(u12, u13); + + { + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + butterfly_self(&v9, &v14, &stg4_4, &stg4_5); + butterfly_self(&v10, &v13, &stg4_6, &stg4_4); + } + + out[0] = _mm_add_epi16(v8, v11); + out[1] = _mm_add_epi16(v9, v10); + out[6] = _mm_add_epi16(v14, v13); + out[7] = _mm_add_epi16(v15, v12); + + out[2] = _mm_sub_epi16(v9, v10); + out[3] = _mm_sub_epi16(v8, v11); + out[4] = _mm_sub_epi16(v15, v12); + out[5] = _mm_sub_epi16(v14, v13); + + { + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); + butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); + } +} + +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i in[32] +static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/, + __m128i *out /*out[8]*/) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; // stp1_ + __m128i v0, v1, v2, v3, v4, v5, v6, v7; // stp2_ + + { + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7); + butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6); + } + + v4 = _mm_add_epi16(u4, u5); + v5 = _mm_sub_epi16(u4, u5); + v6 = _mm_sub_epi16(u7, u6); + v7 = 
_mm_add_epi16(u7, u6); + + { + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); + + butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1); + butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3); + } + + v0 = _mm_add_epi16(u0, u3); + v1 = _mm_add_epi16(u1, u2); + v2 = _mm_sub_epi16(u1, u2); + v3 = _mm_sub_epi16(u0, u3); + + out[0] = _mm_add_epi16(v0, v7); + out[1] = _mm_add_epi16(v1, v6); + out[2] = _mm_add_epi16(v2, v5); + out[3] = _mm_add_epi16(v3, v4); + out[4] = _mm_sub_epi16(v3, v4); + out[5] = _mm_sub_epi16(v2, v5); + out[6] = _mm_sub_epi16(v1, v6); + out[7] = _mm_sub_epi16(v0, v7); +} + +// For each 8x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i in[32] +// We avoid hide an offset, 16, inside this function. 
So we output 0-15 into +// array out[16] +static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/, + __m128i *out /*out[16]*/) { + __m128i v16, v17, v18, v19, v20, v21, v22, v23; + __m128i v24, v25, v26, v27, v28, v29, v30, v31; + __m128i u16, u17, u18, u19, u20, u21, u22, u23; + __m128i u24, u25, u26, u27, u28, u29, u30, u31; + + { + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31); + butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30); + butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29); + butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28); + + butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27); + butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26); + + butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25); + butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24); + } + + v16 = _mm_add_epi16(u16, u17); + v17 = _mm_sub_epi16(u16, u17); + v18 = 
_mm_sub_epi16(u19, u18); + v19 = _mm_add_epi16(u19, u18); + + v20 = _mm_add_epi16(u20, u21); + v21 = _mm_sub_epi16(u20, u21); + v22 = _mm_sub_epi16(u23, u22); + v23 = _mm_add_epi16(u23, u22); + + v24 = _mm_add_epi16(u24, u25); + v25 = _mm_sub_epi16(u24, u25); + v26 = _mm_sub_epi16(u27, u26); + v27 = _mm_add_epi16(u27, u26); + + v28 = _mm_add_epi16(u28, u29); + v29 = _mm_sub_epi16(u28, u29); + v30 = _mm_sub_epi16(u31, u30); + v31 = _mm_add_epi16(u31, u30); + + { + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + butterfly_self(&v17, &v30, &stg3_4, &stg3_5); + butterfly_self(&v18, &v29, &stg3_6, &stg3_4); + butterfly_self(&v21, &v26, &stg3_8, &stg3_9); + butterfly_self(&v22, &v25, &stg3_10, &stg3_8); + } + + u16 = _mm_add_epi16(v16, v19); + u17 = _mm_add_epi16(v17, v18); + u18 = _mm_sub_epi16(v17, v18); + u19 = _mm_sub_epi16(v16, v19); + u20 = _mm_sub_epi16(v23, v20); + u21 = _mm_sub_epi16(v22, v21); + u22 = _mm_add_epi16(v22, v21); + u23 = _mm_add_epi16(v23, v20); + + u24 = _mm_add_epi16(v24, v27); + u25 = _mm_add_epi16(v25, v26); + u26 = _mm_sub_epi16(v25, v26); + u27 = _mm_sub_epi16(v24, v27); + + u28 = _mm_sub_epi16(v31, v28); + u29 = _mm_sub_epi16(v30, v29); + u30 = _mm_add_epi16(v29, v30); + u31 = _mm_add_epi16(v28, v31); + + { + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + butterfly_self(&u18, &u29, &stg4_4, &stg4_5); + butterfly_self(&u19, &u28, &stg4_4, &stg4_5); + butterfly_self(&u20, &u27, &stg4_6, &stg4_4); + butterfly_self(&u21, &u26, &stg4_6, 
&stg4_4); + } + + out[0] = _mm_add_epi16(u16, u23); + out[1] = _mm_add_epi16(u17, u22); + out[2] = _mm_add_epi16(u18, u21); + out[3] = _mm_add_epi16(u19, u20); + out[4] = _mm_sub_epi16(u19, u20); + out[5] = _mm_sub_epi16(u18, u21); + out[6] = _mm_sub_epi16(u17, u22); + out[7] = _mm_sub_epi16(u16, u23); + + out[8] = _mm_sub_epi16(u31, u24); + out[9] = _mm_sub_epi16(u30, u25); + out[10] = _mm_sub_epi16(u29, u26); + out[11] = _mm_sub_epi16(u28, u27); + out[12] = _mm_add_epi16(u27, u28); + out[13] = _mm_add_epi16(u26, u29); + out[14] = _mm_add_epi16(u25, u30); + out[15] = _mm_add_epi16(u24, u31); + + { + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0); + butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0); + butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0); + butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0); + } +} + +static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/, + __m128i *out /*out[32]*/) { + __m128i temp[16]; + idct32_full_8x32_quarter_1(in, temp); + idct32_full_8x32_quarter_2(in, &temp[8]); + add_sub_butterfly(temp, out, 16); +} + +static void idct32_full_8x32(const __m128i *in /*in[32]*/, + __m128i *out /*out[32]*/) { + __m128i temp[32]; + idct32_full_8x32_quarter_1_2(in, temp); + idct32_full_8x32_quarter_3_4(in, &temp[16]); + add_sub_butterfly(temp, out, 32); +} + +static void load_buffer_8x32(const tran_low_t *input, __m128i *in) { + int i; + for (i = 0; i < 8; ++i) { + in[i] = load_input_data(input); + in[i + 8] = load_input_data(input + 8); + in[i + 16] = load_input_data(input + 16); + in[i + 24] = load_input_data(input + 24); + input += 32; + } +} + +void aom_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i col[128], in[32]; + int i, j; + + // rows + for (i = 0; i < 4; ++i) { + load_buffer_8x32(input, in); + input += 32 << 3; + + // Transpose 32x8 block 
to 8x32 block + array_transpose_8x8(in, in); + array_transpose_8x8(in + 8, in + 8); + array_transpose_8x8(in + 16, in + 16); + array_transpose_8x8(in + 24, in + 24); + + idct32_full_8x32(in, col + (i << 5)); + } + + // columns + for (i = 0; i < 4; ++i) { + j = i << 3; + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col + j, in); + array_transpose_8x8(col + j + 32, in + 8); + array_transpose_8x8(col + j + 64, in + 16); + array_transpose_8x8(col + j + 96, in + 24); + + idct32_full_8x32(in, in); + store_buffer_8x32(in, dest, stride); + dest += 8; + } +} diff --git a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm new file mode 100644 index 000000000..f0668e6f3 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm @@ -0,0 +1,112 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
%include "third_party/x86inc/x86inc.asm"

SECTION .text

; Renames registers so that values held in (m0, m1, m2, m3) in the order
; a, c, d, b are subsequently addressed in the order a, b, c, d.
%macro REORDER_INPUTS 0
  ; a c d b  to  a b c d
  SWAP 1, 3, 2
%endmacro

; One 1-D pass of the inverse transform over four 16-bit vectors:
;   a += c;  d -= b;  e = (a - d) >> 1;
;   b = e - b;  c = e - c;  a -= b;  d += c
; Uses m4 and m5 as temporaries.
%macro TRANSFORM_COLS 0
  ; input:
  ; m0 a
  ; m1 b
  ; m2 c
  ; m3 d
  paddw           m0,        m2             ; a += c
  psubw           m3,        m1             ; d -= b

  ; wide subtract: (a - d) >> 1 is evaluated on 32-bit lanes.
  ; punpcklwd places each source word in the HIGH half of a dword (the low
  ; half is whatever m4/m5 held before); psrad 16 then sign-extends it and
  ; discards that stale low half.
  punpcklwd       m4,        m0
  punpcklwd       m5,        m3
  psrad           m4,        16
  psrad           m5,        16
  psubd           m4,        m5
  psrad           m4,        1
  packssdw        m4,        m4             ; e

  psubw           m5,        m4,        m1  ; b
  psubw           m4,        m2             ; c
  psubw           m0,        m5             ; a -= b
  paddw           m3,        m4             ; d += c
                                ; m0 a
  SWAP            1,         5  ; m1 b
  SWAP            2,         4  ; m2 c
                                ; m3 d
%endmacro

; Transposes a 4x4 int16 matrix whose rows sit in the low halves of m0-m3;
; the transposed rows are left in the low halves of m0-m3.
%macro TRANSPOSE_4X4 0
  punpcklwd       m0,        m2
  punpcklwd       m1,        m3
  mova            m2,        m0
  punpcklwd       m0,        m1
  punpckhwd       m2,        m1
  pshufd          m1,        m0,        0x0e  ; high qword of m0 -> low of m1
  pshufd          m3,        m2,        0x0e  ; high qword of m2 -> low of m3
%endmacro

; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
%macro TRANSPOSE_4X4_WIDE 0
  mova            m3,        m0
  punpcklwd       m0,        m1
  punpckhwd       m3,        m1
  mova            m2,        m0
  punpcklwd       m0,        m3
  punpckhwd       m2,        m3
  pshufd          m1,        m0,        0x0e  ; high qword of m0 -> low of m1
  pshufd          m3,        m2,        0x0e  ; high qword of m2 -> low of m3
%endmacro

; Adds two rows of four 16-bit residuals to two rows of four output bytes.
; The bytes at [outputq] and [outputq + strideq] are zero-extended to words
; (using the zero register), the residuals are added, and the sums are
; packed back to bytes with unsigned saturation and stored.
%macro ADD_STORE_4P_2X 5  ; src1, src2, tmp1, tmp2, zero
  movd            m%3,       [outputq]
  movd            m%4,       [outputq + strideq]
  punpcklbw       m%3,       m%5
  punpcklbw       m%4,       m%5
  paddw           m%1,       m%3
  paddw           m%2,       m%4
  packuswb        m%1,       m%5
  packuswb        m%2,       m%5
  movd            [outputq], m%1
  movd            [outputq + strideq], m%2
%endmacro

; iwht4x4_16_add(input, output, stride)
; Loads a 4x4 block of coefficients, shifts each right by 2 (arithmetic),
; runs TRANSFORM_COLS once per dimension with a transpose before each pass,
; and adds the result to the 4x4 block of bytes at output.
; NOTE(review): strideq is used as a full-width register without an explicit
; sign extension (e.g. movsxdifnidn strideq, strided). If stride is declared
; as a 32-bit int by the C prototype, confirm callers never pass a negative
; stride on x86-64.
INIT_XMM sse2
cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
%if CONFIG_HIGHBITDEPTH
  ; 64 bytes are read and narrowed to 16-bit with signed saturation.
  mova            m0,        [inputq +  0]
  packssdw        m0,        [inputq + 16]
  mova            m1,        [inputq + 32]
  packssdw        m1,        [inputq + 48]
%else
  ; 32 bytes are read and used as 16-bit words directly.
  mova            m0,        [inputq +  0]
  mova            m1,        [inputq + 16]
%endif
  psraw           m0,        2
  psraw           m1,        2

  TRANSPOSE_4X4_WIDE
  REORDER_INPUTS
  TRANSFORM_COLS
  TRANSPOSE_4X4
  REORDER_INPUTS
  TRANSFORM_COLS

  pxor            m4,        m4             ; zero register for ADD_STORE_4P_2X
  ADD_STORE_4P_2X  0, 1, 5, 6, 4            ; output rows 0 and 1
  lea             outputq,   [outputq + 2 * strideq]
  ADD_STORE_4P_2X  2, 3, 5, 6, 4            ; output rows 2 and 3

  RET
b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c @@ -0,0 +1,915 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include /* AVX2 */ + +#include "./aom_dsp_rtcd.h" +#include "aom_ports/mem.h" + +void aom_lpf_horizontal_edge_8_avx2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i abs_p1p0; + + const __m128i thresh = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); + const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); + const __m128i blimit = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); + + q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); + q4p4 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + q3p3 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + q2p2 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + q1p1 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0p0 = _mm_castps_si128( + 
_mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; + abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1)); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0)); + abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1)); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); + __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); + __m128i qs0 = _mm_xor_si128(p0q0, t80); + __m128i qs1 = _mm_xor_si128(p1q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + filt = 
_mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, qs0ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (aom_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 0xB); + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 0xB); + + /* Filter1 >> 3 */ + filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); + qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), + filt); + filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); + qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); + // loopfilter done + + { + __m128i work; + flat = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3))); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); + q5p5 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); + + q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); + q6p6 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); + + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)), + _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5))); + + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); + q7p7 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); + + work = 
_mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)), + _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + p7_16 = _mm_unpacklo_epi8(q7p7, zero); + p6_16 = _mm_unpacklo_epi8(q6p6, zero); + p5_16 = _mm_unpacklo_epi8(q5p5, zero); + p4_16 = _mm_unpacklo_epi8(q4p4, zero); + p3_16 = _mm_unpacklo_epi8(q3p3, zero); + p2_16 = _mm_unpacklo_epi8(q2p2, zero); + p1_16 = _mm_unpacklo_epi8(q1p1, zero); + p0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_unpackhi_epi8(q0p0, zero); + q1_16 = _mm_unpackhi_epi8(q1p1, zero); + q2_16 = _mm_unpackhi_epi8(q2p2, zero); + q3_16 = _mm_unpackhi_epi8(q3p3, zero); + q4_16 = _mm_unpackhi_epi8(q4p4, zero); + q5_16 = _mm_unpackhi_epi8(q5p5, zero); + q6_16 = _mm_unpackhi_epi8(q6p6, zero); + q7_16 = _mm_unpackhi_epi8(q7p7, zero); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), + _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), + _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + 
pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(p7_16, p7_16); + sum_q7 = _mm_add_epi16(q7_16, q7_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4); + 
flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4); + res_q = 
_mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4); + flat2_q6p6 = _mm_packus_epi16(res_p, res_q); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + q2p2 = _mm_andnot_si128(flat, q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + q2p2 = _mm_or_si128(q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + q6p6 = _mm_andnot_si128(flat2, q6p6); + flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); + q6p6 = _mm_or_si128(q6p6, flat2_q6p6); + _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); + + q5p5 = _mm_andnot_si128(flat2, q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + q5p5 = _mm_or_si128(q5p5, flat2_q5p5); + _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + + q4p4 = _mm_andnot_si128(flat2, q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + q4p4 = _mm_or_si128(q4p4, flat2_q4p4); + _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + + q3p3 = _mm_andnot_si128(flat2, q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + q3p3 = _mm_or_si128(q3p3, flat2_q3p3); + _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); + _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + + q2p2 = _mm_andnot_si128(flat2, q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + q2p2 = _mm_or_si128(q2p2, flat2_q2p2); + _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + + q1p1 = _mm_andnot_si128(flat2, q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + q1p1 
= _mm_or_si128(q1p1, flat2_q1p1); + _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + + q0p0 = _mm_andnot_si128(flat2, q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + q0p0 = _mm_or_si128(q0p0, flat2_q0p0); + _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); + _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + } +} + +DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { + 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128, + 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 +}; + +void aom_lpf_horizontal_edge_16_avx2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i p7, p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q5, q6, q7; + __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4, + p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0; + + const __m128i thresh = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); + const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); + const __m128i blimit = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); + + p256_4 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p))); + p256_3 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p))); + p256_2 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p))); + p256_1 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p))); + p256_0 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p))); + q256_0 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p))); + q256_1 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p))); + q256_2 = + 
_mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p))); + q256_3 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p))); + q256_4 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p))); + + p4 = _mm256_castsi256_si128(p256_4); + p3 = _mm256_castsi256_si128(p256_3); + p2 = _mm256_castsi256_si128(p256_2); + p1 = _mm256_castsi256_si128(p256_1); + p0 = _mm256_castsi256_si128(p256_0); + q0 = _mm256_castsi256_si128(q256_0); + q1 = _mm256_castsi256_si128(q256_1); + q2 = _mm256_castsi256_si128(q256_2); + q3 = _mm256_castsi256_si128(q256_3); + q4 = _mm256_castsi256_si128(q256_4); + + { + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); + __m128i work; + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); + mask = 
_mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + __m128i ps1 = _mm_xor_si128(p1, t80); + __m128i ps0 = _mm_xor_si128(p0, t80); + __m128i qs0 = _mm_xor_si128(q0, t80); + __m128i qs1 = _mm_xor_si128(q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, + flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5, + flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (aom_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + /* Filter1 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + + /* Filter2 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = 
_mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + // loopfilter done + + { + __m128i work; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), + _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + p256_5 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 6 * p))); + q256_5 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 5 * p))); + p5 = _mm256_castsi256_si128(p256_5); + q5 = _mm256_castsi256_si128(q256_5); + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), + _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); + + flat2 = _mm_max_epu8(work, flat2); + p256_6 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 7 * p))); + q256_6 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 6 * p))); + p6 = _mm256_castsi256_si128(p256_6); + q6 = _mm256_castsi256_si128(q256_6); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), + _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); + + flat2 = _mm_max_epu8(work, flat2); + + p256_7 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 8 * p))); + q256_7 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 7 * p))); + p7 = _mm256_castsi256_si128(p256_7); + q7 = _mm256_castsi256_si128(q256_7); + work = _mm_max_epu8( + 
_mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)), + _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m256i eight = _mm256_set1_epi16(8); + const __m256i four = _mm256_set1_epi16(4); + __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, + pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + const __m256i filter = + _mm256_load_si256((__m256i const *)filt_loopfilter_avx2); + p256_7 = _mm256_shuffle_epi8(p256_7, filter); + p256_6 = _mm256_shuffle_epi8(p256_6, filter); + p256_5 = _mm256_shuffle_epi8(p256_5, filter); + p256_4 = _mm256_shuffle_epi8(p256_4, filter); + p256_3 = _mm256_shuffle_epi8(p256_3, filter); + p256_2 = _mm256_shuffle_epi8(p256_2, filter); + p256_1 = _mm256_shuffle_epi8(p256_1, filter); + p256_0 = _mm256_shuffle_epi8(p256_0, filter); + q256_0 = _mm256_shuffle_epi8(q256_0, filter); + q256_1 = _mm256_shuffle_epi8(q256_1, filter); + q256_2 = _mm256_shuffle_epi8(q256_2, filter); + q256_3 = _mm256_shuffle_epi8(q256_3, filter); + q256_4 = _mm256_shuffle_epi8(q256_4, filter); + q256_5 = _mm256_shuffle_epi8(q256_5, filter); + q256_6 = _mm256_shuffle_epi8(q256_6, filter); + q256_7 = _mm256_shuffle_epi8(q256_7, filter); + + pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), + _mm256_add_epi16(p256_4, p256_3)); + pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), + _mm256_add_epi16(q256_4, q256_3)); + + pixetFilter_p2p1p0 = + _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1)); + pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = + _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1)); + pixelFilter_q = _mm256_add_epi16(pixelFilter_q, 
pixetFilter_q2q1q0); + + pixelFilter_p = _mm256_add_epi16( + eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); + + pixetFilter_p2p1p0 = _mm256_add_epi16( + four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4); + + flat2_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4); + + flat2_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(p256_3, p256_0)), + 3); + + flat_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(q256_3, q256_0)), + 3); + + flat_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + sum_p7 = _mm256_add_epi16(p256_7, p256_7); + + sum_q7 = _mm256_add_epi16(q256_7, q256_7); + + sum_p3 = _mm256_add_epi16(p256_3, p256_3); + + sum_q3 = _mm256_add_epi16(q256_3, q256_3); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4); + + flat2_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4); + + flat2_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); + + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); + + res_p = + 
_mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_1)), + 3); + + flat_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_1)), + 3); + + flat_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + sum_p3 = _mm256_add_epi16(sum_p3, p256_3); + + sum_q3 = _mm256_add_epi16(sum_q3, q256_3); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4); + + flat2_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4); + + flat2_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); + + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); + + res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_2)), + 3); + + flat_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_2)), + 3); + + flat_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); + + res_p = _mm256_srli_epi16( + 
_mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4); + + flat2_p3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4); + + flat2_q3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4); + + flat2_p4 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4); + + flat2_q4 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4); + + flat2_p5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4); + + flat2_q5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4); + + 
flat2_p6 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4); + + flat2_q6 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + } + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + p2 = _mm_andnot_si128(flat, p2); + flat_p2 = _mm_and_si128(flat, flat_p2); + p2 = _mm_or_si128(flat_p2, p2); + + p1 = _mm_andnot_si128(flat, ps1); + flat_p1 = _mm_and_si128(flat, flat_p1); + p1 = _mm_or_si128(flat_p1, p1); + + p0 = _mm_andnot_si128(flat, ps0); + flat_p0 = _mm_and_si128(flat, flat_p0); + p0 = _mm_or_si128(flat_p0, p0); + + q0 = _mm_andnot_si128(flat, qs0); + flat_q0 = _mm_and_si128(flat, flat_q0); + q0 = _mm_or_si128(flat_q0, q0); + + q1 = _mm_andnot_si128(flat, qs1); + flat_q1 = _mm_and_si128(flat, flat_q1); + q1 = _mm_or_si128(flat_q1, q1); + + q2 = _mm_andnot_si128(flat, q2); + flat_q2 = _mm_and_si128(flat, flat_q2); + q2 = _mm_or_si128(flat_q2, q2); + + p6 = _mm_andnot_si128(flat2, p6); + flat2_p6 = _mm_and_si128(flat2, flat2_p6); + p6 = _mm_or_si128(flat2_p6, p6); + _mm_storeu_si128((__m128i *)(s - 7 * p), p6); + + p5 = _mm_andnot_si128(flat2, p5); + flat2_p5 = _mm_and_si128(flat2, flat2_p5); + p5 = _mm_or_si128(flat2_p5, p5); + _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + + p4 = _mm_andnot_si128(flat2, p4); + flat2_p4 = _mm_and_si128(flat2, flat2_p4); + p4 = _mm_or_si128(flat2_p4, p4); + _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + + p3 = _mm_andnot_si128(flat2, p3); + flat2_p3 = _mm_and_si128(flat2, flat2_p3); + p3 = _mm_or_si128(flat2_p3, p3); + _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + + p2 = _mm_andnot_si128(flat2, p2); + flat2_p2 = _mm_and_si128(flat2, flat2_p2); + p2 = _mm_or_si128(flat2_p2, p2); + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + + p1 = _mm_andnot_si128(flat2, p1); + flat2_p1 = _mm_and_si128(flat2, flat2_p1); + p1 = 
_mm_or_si128(flat2_p1, p1); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + + p0 = _mm_andnot_si128(flat2, p0); + flat2_p0 = _mm_and_si128(flat2, flat2_p0); + p0 = _mm_or_si128(flat2_p0, p0); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + + q0 = _mm_andnot_si128(flat2, q0); + flat2_q0 = _mm_and_si128(flat2, flat2_q0); + q0 = _mm_or_si128(flat2_q0, q0); + _mm_storeu_si128((__m128i *)(s - 0 * p), q0); + + q1 = _mm_andnot_si128(flat2, q1); + flat2_q1 = _mm_and_si128(flat2, flat2_q1); + q1 = _mm_or_si128(flat2_q1, q1); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + + q2 = _mm_andnot_si128(flat2, q2); + flat2_q2 = _mm_and_si128(flat2, flat2_q2); + q2 = _mm_or_si128(flat2_q2, q2); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + + q3 = _mm_andnot_si128(flat2, q3); + flat2_q3 = _mm_and_si128(flat2, flat2_q3); + q3 = _mm_or_si128(flat2_q3, q3); + _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + + q4 = _mm_andnot_si128(flat2, q4); + flat2_q4 = _mm_and_si128(flat2, flat2_q4); + q4 = _mm_or_si128(flat2_q4, q4); + _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + + q5 = _mm_andnot_si128(flat2, q5); + flat2_q5 = _mm_and_si128(flat2, flat2_q5); + q5 = _mm_or_si128(flat2_q5, q5); + _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + + q6 = _mm_andnot_si128(flat2, q6); + flat2_q6 = _mm_and_si128(flat2, flat2_q6); + q6 = _mm_or_si128(flat2_q6, q6); + _mm_storeu_si128((__m128i *)(s + 6 * p), q6); + } + _mm256_zeroupper(); +} diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c new file mode 100644 index 000000000..7e134dc63 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c @@ -0,0 +1,1892 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> // SSE2 + +#include "./aom_dsp_rtcd.h" +#include "aom_ports/mem.h" +#include "aom_ports/emmintrin_compat.h" + +static INLINE __m128i abs_diff(__m128i a, __m128i b) { + return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); +} + +#if CONFIG_PARALLEL_DEBLOCKING +// filter_mask and hev_mask +#define FILTER_HEV_MASK4 \ + do { \ + /* (abs(q1 - q0), abs(p1 - p0) */ \ + __m128i flat = abs_diff(q1p1, q0p0); \ + /* abs(p1 - q1), abs(p0 - q0) */ \ + const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \ + __m128i abs_p0q0, abs_p1q1; \ + \ + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ + hev = \ + _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ + hev = _mm_cmpgt_epi16(hev, thresh); \ + hev = _mm_packs_epi16(hev, hev); \ + \ + /* const int8_t mask = filter_mask2(*limit, *blimit, */ \ + /* p1, p0, q0, q1); */ \ + abs_p0q0 = \ + _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \ + abs_p1q1 = \ + _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \ + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \ + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ + mask = _mm_unpacklo_epi64(mask, flat); \ + mask = _mm_subs_epu8(mask, limit); \ + mask = _mm_cmpeq_epi8(mask, zero); \ + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ + } while (0) +#endif // CONFIG_PARALLEL_DEBLOCKING + +// filter_mask and hev_mask +#define FILTER_HEV_MASK \ + do { \ + /* (abs(q1 - q0), abs(p1 - p0) */ \ + __m128i flat = 
abs_diff(q1p1, q0p0); \ + /* abs(p1 - q1), abs(p0 - q0) */ \ + const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \ + __m128i abs_p0q0, abs_p1q1, work; \ + \ + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ + hev = \ + _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ + hev = _mm_cmpgt_epi16(hev, thresh); \ + hev = _mm_packs_epi16(hev, hev); \ + \ + /* const int8_t mask = filter_mask(*limit, *blimit, */ \ + /* p3, p2, p1, p0, q0, q1, q2, q3); */ \ + abs_p0q0 = \ + _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \ + abs_p1q1 = \ + _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \ + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \ + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \ + /* abs(p3 - p2), abs(p2 - p1) */ \ + work = abs_diff(p3p2, p2p1); \ + flat = _mm_max_epu8(work, flat); \ + /* abs(q3 - q2), abs(q2 - q1) */ \ + work = abs_diff(q3q2, q2q1); \ + flat = _mm_max_epu8(work, flat); \ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ + mask = _mm_unpacklo_epi64(mask, flat); \ + mask = _mm_subs_epu8(mask, limit); \ + mask = _mm_cmpeq_epi8(mask, zero); \ + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ + } while (0) + +#define FILTER4 \ + do { \ + const __m128i t3t4 = \ + _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); \ + const __m128i t80 = _mm_set1_epi8(0x80); \ + __m128i filter, filter2filter1, work; \ + \ + ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \ + qs1qs0 = _mm_xor_si128(q1q0, t80); \ + \ + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \ + work = _mm_subs_epi8(ps1ps0, qs1qs0); \ + filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \ + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \ + filter = _mm_subs_epi8(filter, work); \ + filter = _mm_subs_epi8(filter, work); \ + filter = _mm_subs_epi8(filter, 
work); /* + 3 * (qs0 - ps0) */ \ + filter = _mm_and_si128(filter, mask); /* & mask */ \ + filter = _mm_unpacklo_epi64(filter, filter); \ + \ + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \ + filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); \ + filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \ + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \ + filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \ + \ + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \ + filter = _mm_unpacklo_epi8(filter, filter); \ + filter = _mm_srai_epi16(filter, 9); /* round */ \ + filter = _mm_packs_epi16(filter, filter); \ + filter = _mm_andnot_si128(hev, filter); \ + \ + hev = _mm_unpackhi_epi64(filter2filter1, filter); \ + filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \ + \ + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \ + qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \ + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \ + ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \ + qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \ + ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \ + } while (0) + +void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, + const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i limit = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + const __m128i thresh = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); +#if !CONFIG_PARALLEL_DEBLOCKING + __m128i p3p2, p2p1, q3q2, q2q1; +#endif // 
!CONFIG_PARALLEL_DEBLOCKING + __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0; + __m128i mask, hev; +#if !CONFIG_PARALLEL_DEBLOCKING + p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), + _mm_loadl_epi64((__m128i *)(s - 4 * p))); +#endif // !CONFIG_PARALLEL_DEBLOCKING + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 1 * p))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + _mm_loadl_epi64((__m128i *)(s + 0 * p))); +#if !CONFIG_PARALLEL_DEBLOCKING + q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 3 * p))); +#endif // !CONFIG_PARALLEL_DEBLOCKING + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); +#if !CONFIG_PARALLEL_DEBLOCKING + p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); + q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); +#endif // !CONFIG_PARALLEL_DEBLOCKING +#if !CONFIG_PARALLEL_DEBLOCKING + FILTER_HEV_MASK; +#else // CONFIG_PARALLEL_DEBLOCKING + FILTER_HEV_MASK4; +#endif // !CONFIG_PARALLEL_DEBLOCKING + FILTER4; + + _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1 + _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 + _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 + _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1 +} + +void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, + const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i limit = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + const __m128i thresh = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + __m128i x0, x1, x2, x3; +#if !CONFIG_PARALLEL_DEBLOCKING + __m128i p3p2, p2p1, q3q2, q2q1; +#endif // !CONFIG_PARALLEL_DEBLOCKING + __m128i q1p1, q0p0, p1p0, 
q1q0, ps1ps0, qs1qs0; + __m128i mask, hev; + + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)), + _mm_loadl_epi64((__m128i *)(s + 1 * p - 4))); + + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)), + _mm_loadl_epi64((__m128i *)(s + 3 * p - 4))); + + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)), + _mm_loadl_epi64((__m128i *)(s + 5 * p - 4))); + + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)), + _mm_loadl_epi64((__m128i *)(s + 7 * p - 4))); + + // Transpose 8x8 + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + p1p0 = _mm_unpacklo_epi16(q1q0, x1); + // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + x0 = _mm_unpacklo_epi16(x2, x3); +#if !CONFIG_PARALLEL_DEBLOCKING + // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + p3p2 = _mm_unpacklo_epi32(p1p0, x0); +#endif // !CONFIG_PARALLEL_DEBLOCKING + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + p1p0 = _mm_unpackhi_epi32(p1p0, x0); +#if !CONFIG_PARALLEL_DEBLOCKING + p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8)); // swap lo and high +#endif // !CONFIG_PARALLEL_DEBLOCKING + p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high + + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + q1q0 = _mm_unpackhi_epi16(q1q0, x1); + // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + x2 = _mm_unpackhi_epi16(x2, x3); +#if !CONFIG_PARALLEL_DEBLOCKING + // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + q3q2 = _mm_unpackhi_epi32(q1q0, x2); +#endif // !CONFIG_PARALLEL_DEBLOCKING + // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + q1q0 = _mm_unpacklo_epi32(q1q0, x2); + + q0p0 = _mm_unpacklo_epi64(p1p0, q1q0); + q1p1 = _mm_unpackhi_epi64(p1p0, q1q0); + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); +#if 
!CONFIG_PARALLEL_DEBLOCKING + p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); + q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); +#endif // !CONFIG_PARALLEL_DEBLOCKING +#if !CONFIG_PARALLEL_DEBLOCKING + FILTER_HEV_MASK; +#else // CONFIG_PARALLEL_DEBLOCKING + FILTER_HEV_MASK4; +#endif // !CONFIG_PARALLEL_DEBLOCKING + FILTER4; + + // Transpose 8x4 to 4x8 + // qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + // ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07 + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8)); + // 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37 + x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0); + // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27 + ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0); + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0); + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); + + *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + ps1ps0 = _mm_srli_si128(ps1ps0, 4); + *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + ps1ps0 = _mm_srli_si128(ps1ps0, 4); + *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + ps1ps0 = _mm_srli_si128(ps1ps0, 4); + *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + + *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + qs1qs0 = _mm_srli_si128(qs1qs0, 4); + *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + qs1qs0 = _mm_srli_si128(qs1qs0, 4); + *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + qs1qs0 = _mm_srli_si128(qs1qs0, 4); + *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0); +} + +void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = 
_mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat, flat2; + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i abs_p1p0; + + q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); + q4p4 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + q3p3 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + q2p2 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + q1p1 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0p0 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = abs_diff(q0p0, p0q0); + abs_p1q1 = abs_diff(q1p1, p1q1); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 
8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); + __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); + __m128i qs0 = _mm_xor_si128(p0q0, t80); + __m128i qs1 = _mm_xor_si128(p1q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, qs0ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (aom_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 0xB); + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 0xB); + + // Filter1 >> 3 + filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); + qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); + + // filt >> 1 + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), + filt); + filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); + qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); + // loopfilter done + + { + __m128i work; + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); + q5p5 = 
_mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); + + q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); + q6p6 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); + flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0)); + + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); + q7p7 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); + work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0)); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + p7_16 = _mm_unpacklo_epi8(q7p7, zero); + p6_16 = _mm_unpacklo_epi8(q6p6, zero); + p5_16 = _mm_unpacklo_epi8(q5p5, zero); + p4_16 = _mm_unpacklo_epi8(q4p4, zero); + p3_16 = _mm_unpacklo_epi8(q3p3, zero); + p2_16 = _mm_unpacklo_epi8(q2p2, zero); + p1_16 = _mm_unpacklo_epi8(q1p1, zero); + p0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_unpackhi_epi8(q0p0, zero); + q1_16 = _mm_unpackhi_epi8(q1p1, zero); + q2_16 = _mm_unpackhi_epi8(q2p2, zero); + q3_16 = _mm_unpackhi_epi8(q3p3, zero); + q4_16 = _mm_unpackhi_epi8(q4p4, zero); + q5_16 = _mm_unpackhi_epi8(q5p5, zero); + q6_16 = _mm_unpackhi_epi8(q6p6, zero); + q7_16 = _mm_unpackhi_epi8(q7p7, zero); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), + _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = 
_mm_add_epi16(_mm_add_epi16(q6_16, q5_16), + _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(p7_16, p7_16); + sum_q7 = _mm_add_epi16(q7_16, q7_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + sum_p3 = 
_mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, 
_mm_add_epi16(sum_q7, q5_16)), 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4); + flat2_q6p6 = _mm_packus_epi16(res_p, res_q); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + q2p2 = _mm_andnot_si128(flat, q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + q2p2 = _mm_or_si128(q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + q6p6 = _mm_andnot_si128(flat2, q6p6); + flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); + q6p6 = _mm_or_si128(q6p6, flat2_q6p6); + _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); + + q5p5 = _mm_andnot_si128(flat2, q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + q5p5 = _mm_or_si128(q5p5, flat2_q5p5); + _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + + q4p4 = _mm_andnot_si128(flat2, q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + q4p4 = _mm_or_si128(q4p4, flat2_q4p4); + _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + + q3p3 = _mm_andnot_si128(flat2, q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + q3p3 = _mm_or_si128(q3p3, flat2_q3p3); + _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); + _mm_storeh_pi((__m64 *)(s 
+ 3 * p), _mm_castsi128_ps(q3p3)); + + q2p2 = _mm_andnot_si128(flat2, q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + q2p2 = _mm_or_si128(q2p2, flat2_q2p2); + _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + + q1p1 = _mm_andnot_si128(flat2, q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + q1p1 = _mm_or_si128(q1p1, flat2_q1p1); + _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + + q0p0 = _mm_andnot_si128(flat2, q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + q0p0 = _mm_or_si128(q0p0, flat2_q0p0); + _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); + _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + } +} +/* Running-sum step used by the 8-tap and wide filters below: returns *total + *a1 + *a2 - (*s1 + *s2) in each 16-bit lane, i.e. slides the tap window by adding two terms and dropping two. */ +static INLINE __m128i filter_add2_sub2(const __m128i *const total, + const __m128i *const a1, + const __m128i *const a2, + const __m128i *const s1, + const __m128i *const s2) { + __m128i x = _mm_add_epi16(*a1, *total); + x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2); + return x; +} +/* Finishes one 8-tap output: shifts the 16-bit sums *f8_lo and *f8_hi right by 3, packs them to 16 bytes with unsigned saturation, and returns that value in the bit positions set in *flat and *other_filt everywhere else. */ +static INLINE __m128i filter8_mask(const __m128i *const flat, + const __m128i *const other_filt, + const __m128i *const f8_lo, + const __m128i *const f8_hi) { + const __m128i f8 = + _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3)); + const __m128i result = _mm_and_si128(*flat, f8); + return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); +} +/* Same select as filter8_mask but for the wide filter: the 16-bit sums *f_lo and *f_hi are shifted right by 4 before the unsigned-saturating pack. */ +static INLINE __m128i filter16_mask(const __m128i *const flat, + const __m128i *const other_filt, + const __m128i *const f_lo, + const __m128i *const f_hi) { + const __m128i f = + _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4)); + const __m128i result = _mm_and_si128(*flat, f); + return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); +} +/* Loop filter across a horizontal edge, 16 pixels wide: reads the 16 rows s - 8 * p .. s + 7 * p with unaligned 16-byte loads and stores 16 bytes to each of the rows s - 7 * p .. s + 6 * p. _blimit, _limit and _thresh are each read as a full vector with aligned loads (_mm_load_si128), so each must point to 16-byte-aligned storage of at least 16 bytes. */ +void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) {
+ const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat, flat2; + __m128i p7, p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q5, q6, q7; + + __m128i op2, op1, op0, oq0, oq1, oq2; + + __m128i max_abs_p1p0q1q0; + + p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); + p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); + p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); + p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); + q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); + q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); + q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); + + { + const __m128i abs_p1p0 = abs_diff(p1, p0); + const __m128i abs_q1q0 = abs_diff(q1, q0); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + __m128i abs_p0q0 = abs_diff(p0, q0); + __m128i abs_p1q1 = abs_diff(p1, q1); + __m128i work; + max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - 
q0) > limit) * -1; + work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2)); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + { + __m128i work; + work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0)); + flat = _mm_max_epu8(work, max_abs_p1p0q1q0); + work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0)); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0)); + flat2 = _mm_max_epu8(work, flat2); + work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0)); + flat2 = _mm_max_epu8(work, flat2); + work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0)); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // filter4 + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + const __m128i ff = _mm_cmpeq_epi8(t4, t4); + + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + op1 = _mm_xor_si128(p1, t80); + op0 = _mm_xor_si128(p0, t80); + oq0 = _mm_xor_si128(q0, t80); + oq1 = _mm_xor_si128(q1, t80); + + hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); + + work_a = _mm_subs_epi8(oq0, op0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = 
_mm_adds_epi8(filt, work_a); + // (aom_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); + oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); + // loopfilter done + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // filter8 + { + const __m128i four = _mm_set1_epi16(4); + const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); + const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); + const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); + const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); + const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); + const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); + const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); + const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); + + const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); + const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); + const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); + const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); + const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); + const __m128i q1_hi = 
_mm_unpackhi_epi8(q1, zero); + const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); + const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); + __m128i f8_lo, f8_hi; + + f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four), + _mm_add_epi16(p3_lo, p2_lo)); + f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo), + _mm_add_epi16(p2_lo, p1_lo)); + f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo); + + f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four), + _mm_add_epi16(p3_hi, p2_hi)); + f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi), + _mm_add_epi16(p2_hi, p1_hi)); + f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi); + + op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi); + op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi); + op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi); + oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi); + oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi); + oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi); + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero); + const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero); + const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero); + const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero); + const __m128i p3_lo = 
_mm_unpacklo_epi8(p3, zero); + const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); + const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); + const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); + const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); + const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); + const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); + const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); + const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero); + const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero); + const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero); + const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero); + + const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero); + const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero); + const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero); + const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero); + const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); + const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); + const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); + const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); + const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); + const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); + const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); + const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); + const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero); + const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero); + const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero); + const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero); + + __m128i f_lo; + __m128i f_hi; + + f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7 + f_lo = + _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo)); + f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo), + _mm_add_epi16(p2_lo, p1_lo)); + f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo); + f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo); + + f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7 + f_hi = + _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi)); + 
f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi), + _mm_add_epi16(p2_hi, p1_hi)); + f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi); + f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi); + + p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 7 * p), p6); + + f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi); + p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + + f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi); + p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + + f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi); + p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + + f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi); + op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 3 * p), op2); + + f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi); + op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 2 * p), op1); + + f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi); + op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 1 * p), op0); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi); + oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo); + 
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi); + oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi); + oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi); + q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi); + q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi); + q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi); + q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 6 * p), q6); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + } +} +/* 8-pixel-wide loop filter across the horizontal edge at s (p is the byte step between successive rows): reads the rows s - 4 * p .. s + 3 * p and stores 8 bytes to each of the rows s - 3 * p .. s + 2 * p. The flat_* arrays are scratch for the 8-tap results; _blimit, _limit and _thresh are read with aligned 16-byte loads (_mm_load_si128). */ +void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); + const __m128i zero = _mm_set1_epi16(0); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i
limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat; + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; +/* Each qNpN vector keeps the p row in its low 8 bytes and the mirrored q row in its high 8 bytes; p1q1 and p0q0 are the same data with the two 64-bit halves swapped (shuffle immediate 78 = 0x4e). */ + q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), + _mm_loadl_epi64((__m128i *)(s + 3 * p))); + q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), + _mm_loadl_epi64((__m128i *)(s + 2 * p))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 1 * p))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + _mm_loadl_epi64((__m128i *)(s - 0 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + // filter_mask and hev_mask + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); +/* With that layout the low half of abs_p1p0 compares p1 with p0 and the high half q1 with q0 (assuming abs_diff(), defined outside this chunk, is a per-byte absolute difference); the 8-byte shift above brings the q half down. */ + abs_p0q0 = abs_diff(q0p0, p0q0); + abs_p1q1 = abs_diff(q1p1, p1q1); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); +/* hev is now 0xff in every byte whose saturating (flat - thresh) was non-zero and 0x00 elsewhere. */ + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // flat_mask4 + + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3,
q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + } +/* 8-tap candidates: widen the 8 pixels of every row to 16-bit lanes, form the rounded sums (+4, then >> 3) and park the repacked bytes in the flat_* scratch arrays for the blend at the end. */ + { + const __m128i four = _mm_set1_epi16(4); + unsigned char *src = s; + { + __m128i workp_a, workp_b, workp_shft; + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); +/* flat_op2 = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3 */ + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op2[0], + _mm_packus_epi16(workp_shft, workp_shft)); +/* flat_op1 = (2*p3 + p2 + 2*p1 + p0 + q0 + q1 + 4) >> 3 */ + workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op1[0], + _mm_packus_epi16(workp_shft, workp_shft)); +/* flat_op0 = (p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + 4) >> 3 */ + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op0[0], + _mm_packus_epi16(workp_shft, workp_shft)); +/* flat_oq0 = (p2 + p1 + p0 + 2*q0 + q1 + q2 + q3 + 4) >> 3 */ + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq0[0], +
_mm_packus_epi16(workp_shft, workp_shft)); +/* flat_oq1 = (p1 + p0 + q0 + 2*q1 + q2 + 2*q3 + 4) >> 3 */ + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq1[0], + _mm_packus_epi16(workp_shft, workp_shft)); +/* flat_oq2 = (p0 + q0 + q1 + 2*q2 + 3*q3 + 4) >> 3 */ + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq2[0], + _mm_packus_epi16(workp_shft, workp_shft)); + } + } + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i ps1 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80); + const __m128i ps0 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80); + const __m128i qs0 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80); + const __m128i qs1 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; +/* ps1, ps0, qs0 and qs1 are the pixels XORed with 0x80, so the signed saturating byte ops below can be applied to them. */ + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (aom_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); +/* Signed per-byte shifts below: each byte is placed in the high half of a 16-bit lane (unpack under zero), arithmetic-shifted by 8 plus the wanted count (11 for >> 3, 9 for >> 1) and packed back with signed saturation. */ + // Filter1 >> 3 + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 11); + filter1 = _mm_packs_epi16(filter1, filter1); + + // Filter2 >> 3 + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 11); + filter2 = _mm_packs_epi16(filter2, zero); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + filt = _mm_unpacklo_epi8(zero, filt); + filt = _mm_srai_epi16(filt, 9); + filt =
_mm_packs_epi16(filt, zero); + + filt = _mm_andnot_si128(hev, filt); +/* Blend per byte: the 8-tap value from the flat_* scratch arrays where flat is set, otherwise the 4-tap result (XORed with 0x80 again to undo the sign bias). */ + work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q0 = _mm_loadl_epi64((__m128i *)flat_oq0); + work_a = _mm_andnot_si128(flat, work_a); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + q1 = _mm_loadl_epi64((__m128i *)flat_oq1); + work_a = _mm_andnot_si128(flat, work_a); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); +/* q2 has no 4-tap variant, so the unfiltered row is the fallback. NOTE(review): this is a 16-byte load although only the low 8 bytes are stored below, so it reads 8 bytes past the 8-pixel row - confirm that callers guarantee that much readable padding. */ + work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q2 = _mm_loadl_epi64((__m128i *)flat_oq2); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p0 = _mm_loadl_epi64((__m128i *)flat_op0); + work_a = _mm_andnot_si128(flat, work_a); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + p1 = _mm_loadl_epi64((__m128i *)flat_op1); + work_a = _mm_andnot_si128(flat, work_a); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); +/* p2 likewise falls back to the unfiltered row, fetched with the same kind of 16-byte load. */ + work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p2 = _mm_loadl_epi64((__m128i *)flat_op2); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s - 2 * p), p1); + _mm_storel_epi64((__m128i *)(s - 1 * p), p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), q1); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); + } +} +/* Filters two adjacent 8-pixel segments in one pass: rows are read and written 16 bytes wide, and each threshold vector takes its low 8 bytes from the ...0 pointer and its high 8 bytes from the ...1 pointer (see the _mm_unpacklo_epi64 setup). All six threshold pointers are read with aligned 16-byte loads (_mm_load_si128). */ +void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); + DECLARE_ALIGNED(16, unsigned char,
flat_op0[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); + const __m128i zero = _mm_set1_epi16(0); + const __m128i blimit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + const __m128i thresh = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + + __m128i mask, hev, flat; + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + { + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); + __m128i work; + + // filter_mask and hev_mask + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - 
q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // flat_mask4 + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + } + { + const __m128i four = _mm_set1_epi16(4); + unsigned char *src = s; + int i = 0; + + do { + __m128i workp_a, workp_b, workp_shft; + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); + workp_b = 
_mm_add_epi16(_mm_add_epi16(q0, p2), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op2[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op1[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op0[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq0[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq1[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq2[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + src += 8; + } while (++i < 2); + } + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + const __m128i ps1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + const __m128i ps0 = + 
_mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + const __m128i qs0 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); + const __m128i qs1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (aom_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + + filt = _mm_andnot_si128(hev, filt); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q0 = _mm_load_si128((__m128i *)flat_oq0); + work_a = _mm_andnot_si128(flat, work_a); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + q1 = _mm_load_si128((__m128i *)flat_oq1); + work_a = _mm_andnot_si128(flat, work_a); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + + work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q2 = _mm_load_si128((__m128i *)flat_oq2); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = 
_mm_or_si128(work_a, q2); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p0 = _mm_load_si128((__m128i *)flat_op0); + work_a = _mm_andnot_si128(flat, work_a); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + p1 = _mm_load_si128((__m128i *)flat_op1); + work_a = _mm_andnot_si128(flat, work_a); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + + work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p2 = _mm_load_si128((__m128i *)flat_op2); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + } +} + +void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + const __m128i blimit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + const __m128i thresh = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + const __m128i zero = _mm_set1_epi16(0); +#if !CONFIG_PARALLEL_DEBLOCKING + __m128i p3, p2, q2, q3; +#endif // !CONFIG_PARALLEL_DEBLOCKING + __m128i p1, p0, q0, q1; + __m128i mask, hev, flat; +#if !CONFIG_PARALLEL_DEBLOCKING + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); +#endif // !CONFIG_PARALLEL_DEBLOCKING + p1 = 
_mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); +#if !CONFIG_PARALLEL_DEBLOCKING + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); +#endif // !CONFIG_PARALLEL_DEBLOCKING + // filter_mask and hev_mask + { + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); +#if !CONFIG_PARALLEL_DEBLOCKING + __m128i work; +#endif // !CONFIG_PARALLEL_DEBLOCKING + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); +#if !CONFIG_PARALLEL_DEBLOCKING + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); +#endif // !CONFIG_PARALLEL_DEBLOCKING + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // 
filter4 + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + const __m128i ps1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + const __m128i ps0 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + const __m128i qs0 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); + const __m128i qs1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (aom_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + + filt = _mm_andnot_si128(hev, filt); + + q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p1 = 
_mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + } +} + +static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, + int in_p, unsigned char *out, int out_p) { + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i x8, x9, x10, x11, x12, x13, x14, x15; + + // 2-way interleave w/hoisting of unpacks + x0 = _mm_loadl_epi64((__m128i *)in0); // 1 + x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); // 3 + x0 = _mm_unpacklo_epi8(x0, x1); // 1 + + x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); // 5 + x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p)); // 7 + x1 = _mm_unpacklo_epi8(x2, x3); // 2 + + x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p)); // 9 + x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p)); // 11 + x2 = _mm_unpacklo_epi8(x4, x5); // 3 + + x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p)); // 13 + x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p)); // 15 + x3 = _mm_unpacklo_epi8(x6, x7); // 4 + x4 = _mm_unpacklo_epi16(x0, x1); // 9 + + x8 = _mm_loadl_epi64((__m128i *)in1); // 2 + x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); // 4 + x8 = _mm_unpacklo_epi8(x8, x9); // 5 + x5 = _mm_unpacklo_epi16(x2, x3); // 10 + + x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); // 6 + x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p)); // 8 + x9 = _mm_unpacklo_epi8(x10, x11); // 6 + + x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p)); // 10 + x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p)); // 12 + x10 = _mm_unpacklo_epi8(x12, x13); // 7 + x12 = _mm_unpacklo_epi16(x8, x9); // 11 + + x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p)); // 14 + x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p)); // 16 + x11 = _mm_unpacklo_epi8(x14, x15); // 8 + x13 = _mm_unpacklo_epi16(x10, x11); // 12 + + x6 = _mm_unpacklo_epi32(x4, x5); // 13 + x7 = _mm_unpackhi_epi32(x4, x5); // 14 + 
x14 = _mm_unpacklo_epi32(x12, x13); // 15 + x15 = _mm_unpackhi_epi32(x12, x13); // 16 + + // Store first 4-line result + _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15)); + _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15)); + + x4 = _mm_unpackhi_epi16(x0, x1); + x5 = _mm_unpackhi_epi16(x2, x3); + x12 = _mm_unpackhi_epi16(x8, x9); + x13 = _mm_unpackhi_epi16(x10, x11); + + x6 = _mm_unpacklo_epi32(x4, x5); + x7 = _mm_unpackhi_epi32(x4, x5); + x14 = _mm_unpacklo_epi32(x12, x13); + x15 = _mm_unpackhi_epi32(x12, x13); + + // Store second 4-line result + _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15)); + _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15)); +} + +#if CONFIG_PARALLEL_DEBLOCKING +#define movq(p) _mm_loadl_epi64((const __m128i *)(p)) +#define punpcklbw(r0, r1) _mm_unpacklo_epi8(r0, r1) +#define punpcklwd(r0, r1) _mm_unpacklo_epi16(r0, r1) +#define punpckhwd(r0, r1) _mm_unpackhi_epi16(r0, r1) +#define movd(p, r) *((uint32_t *)(p)) = _mm_cvtsi128_si32(r) +#define pshufd(r, imm) _mm_shuffle_epi32(r, imm) +enum { ROTATE_DWORD_RIGHT = 0x39 }; +static INLINE void transpose16x4(uint8_t *pDst, const ptrdiff_t dstStride, + const uint8_t *pSrc, + const ptrdiff_t srcStride) { + for (uint32_t idx = 0; idx < 2; idx += 1) { + __m128i r0, r1, r2, r3; + // load data + r0 = movq(pSrc); + r1 = movq(pSrc + srcStride); + r2 = movq(pSrc + srcStride * 2); + r3 = movq(pSrc + srcStride * 3); + // transpose + r0 = punpcklbw(r0, r1); + r2 = punpcklbw(r2, r3); + r1 = punpckhwd(r0, r2); + r0 = punpcklwd(r0, r2); + // store data + movd(pDst, r0); + r0 = pshufd(r0, ROTATE_DWORD_RIGHT); + 
movd(pDst + dstStride, r0); + r0 = pshufd(r0, ROTATE_DWORD_RIGHT); + movd(pDst + dstStride * 2, r0); + r0 = pshufd(r0, ROTATE_DWORD_RIGHT); + movd(pDst + dstStride * 3, r0); + movd(pDst + dstStride * 4, r1); + r1 = pshufd(r1, ROTATE_DWORD_RIGHT); + movd(pDst + dstStride * 5, r1); + r1 = pshufd(r1, ROTATE_DWORD_RIGHT); + movd(pDst + dstStride * 6, r1); + r1 = pshufd(r1, ROTATE_DWORD_RIGHT); + movd(pDst + dstStride * 7, r1); + // advance the pointers + pDst += dstStride * 8; + pSrc += 8; + } +} + +#endif // CONFIG_PARALLEL_DEBLOCKING +static INLINE void transpose(unsigned char *src[], int in_p, + unsigned char *dst[], int out_p, + int num_8x8_to_transpose) { + int idx8x8 = 0; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + do { + unsigned char *in = src[idx8x8]; + unsigned char *out = dst[idx8x8]; + + x0 = + _mm_loadl_epi64((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 + x1 = + _mm_loadl_epi64((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + x0 = _mm_unpacklo_epi8(x0, x1); + + x2 = + _mm_loadl_epi64((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 + x3 = + _mm_loadl_epi64((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + x1 = _mm_unpacklo_epi8(x2, x3); + + x4 = + _mm_loadl_epi64((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 + x5 = + _mm_loadl_epi64((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + x2 = _mm_unpacklo_epi8(x4, x5); + + x6 = + _mm_loadl_epi64((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 + x7 = + _mm_loadl_epi64((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + x3 = _mm_unpacklo_epi8(x6, x7); + + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + x4 = _mm_unpacklo_epi16(x0, x1); + // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + x5 = _mm_unpacklo_epi16(x2, x3); + // 00 10 20 
30 40 50 60 70 01 11 21 31 41 51 61 71 + x6 = _mm_unpacklo_epi32(x4, x5); + _mm_storel_pd((double *)(out + 0 * out_p), + _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70 + _mm_storeh_pd((double *)(out + 1 * out_p), + _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + x7 = _mm_unpackhi_epi32(x4, x5); + _mm_storel_pd((double *)(out + 2 * out_p), + _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72 + _mm_storeh_pd((double *)(out + 3 * out_p), + _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73 + + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + x4 = _mm_unpackhi_epi16(x0, x1); + // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + x5 = _mm_unpackhi_epi16(x2, x3); + // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + x6 = _mm_unpacklo_epi32(x4, x5); + _mm_storel_pd((double *)(out + 4 * out_p), + _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74 + _mm_storeh_pd((double *)(out + 5 * out_p), + _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + x7 = _mm_unpackhi_epi32(x4, x5); + + _mm_storel_pd((double *)(out + 6 * out_p), + _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76 + _mm_storeh_pd((double *)(out + 7 * out_p), + _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77 + } while (++idx8x8 < num_8x8_to_transpose); +} + +void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); +#if !CONFIG_PARALLEL_DEBLOCKING + unsigned char *src[2]; + unsigned char *dst[2]; +#endif // !CONFIG_PARALLEL_DEBLOCKING + // Transpose 8x16 + transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + aom_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, + blimit1, limit1, thresh1); +#if !CONFIG_PARALLEL_DEBLOCKING + src[0] = t_dst; + src[1] = t_dst + 8; 
+ dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + transpose(src, 16, dst, p, 2); +#else // CONFIG_PARALLEL_DEBLOCKING + transpose16x4(s - 2, p, t_dst + 16 * 2, 16); +#endif // !CONFIG_PARALLEL_DEBLOCKING +} + +void aom_lpf_vertical_8_sse2(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { + DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]); + unsigned char *src[1]; + unsigned char *dst[1]; + + // Transpose 8x8 + src[0] = s - 4; + dst[0] = t_dst; + + transpose(src, p, dst, 8, 1); + + // Loop filtering + aom_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh); + + src[0] = t_dst; + dst[0] = s - 4; + + // Transpose back + transpose(src, 8, dst, p, 1); +} + +void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); + unsigned char *src[2]; + unsigned char *dst[2]; + + // Transpose 8x16 + transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + aom_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, + blimit1, limit1, thresh1); + src[0] = t_dst; + src[1] = t_dst + 8; + + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + transpose(src, 16, dst, p, 2); +} + +void aom_lpf_vertical_16_sse2(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { + DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]); + unsigned char *src[2]; + unsigned char *dst[2]; + + src[0] = s - 8; + src[1] = s; + dst[0] = t_dst; + dst[1] = t_dst + 8 * 8; + + // Transpose 16x8 + transpose(src, p, dst, 8, 2); + + // Loop filtering + aom_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh); + + src[0] = t_dst; + src[1] = t_dst + 8 * 8; + dst[0] = s - 8; + dst[1] = s; + + // Transpose back + 
transpose(src, 8, dst, p, 2); +} + +void aom_lpf_vertical_16_dual_sse2(unsigned char *s, int p, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + DECLARE_ALIGNED(16, unsigned char, t_dst[256]); + + // Transpose 16x16 + transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); + transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); + + // Loop filtering + aom_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh); + + // Transpose back + transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); + transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); +} diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c new file mode 100644 index 000000000..5166e9e0a --- /dev/null +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -0,0 +1,334 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "aom_ports/mem.h" +#include "./aom_config.h" +#include "aom/aom_integer.h" + +static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) { + __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr); + __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride)); + return _mm_unpacklo_epi64(temp1, temp2); +} + +static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) { + __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t *)ptr); + __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride)); + __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2); + temp1 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 2)); + temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 3)); + temp1 = _mm_unpacklo_epi32(temp1, temp2); + return _mm_unpacklo_epi64(temp3, temp1); +} + +static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height); + +static INLINE unsigned int masked_sad8xh_ssse3( + const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height); + +static INLINE unsigned int masked_sad4xh_ssse3( + const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height); + +#define MASKSADMXN_SSSE3(m, n) \ + unsigned int aom_masked_sad##m##x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \ + m, n); \ + } + +#if CONFIG_EXT_PARTITION +MASKSADMXN_SSSE3(128, 128) +MASKSADMXN_SSSE3(128, 64) +MASKSADMXN_SSSE3(64, 128) +#endif // CONFIG_EXT_PARTITION +MASKSADMXN_SSSE3(64, 64) +MASKSADMXN_SSSE3(64, 32) +MASKSADMXN_SSSE3(32, 64) +MASKSADMXN_SSSE3(32, 32) +MASKSADMXN_SSSE3(32, 16) +MASKSADMXN_SSSE3(16, 32) 
+MASKSADMXN_SSSE3(16, 16) +MASKSADMXN_SSSE3(16, 8) + +#define MASKSAD8XN_SSSE3(n) \ + unsigned int aom_masked_sad8x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk, \ + msk_stride, n); \ + } + +MASKSAD8XN_SSSE3(16) +MASKSAD8XN_SSSE3(8) +MASKSAD8XN_SSSE3(4) + +#define MASKSAD4XN_SSSE3(n) \ + unsigned int aom_masked_sad4x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \ + msk_stride, n); \ + } + +MASKSAD4XN_SSSE3(8) +MASKSAD4XN_SSSE3(4) + +// For width a multiple of 16 +// Assumes values in m are <=64 +static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height) { + int y, x; + __m128i a, b, m, temp1, temp2; + __m128i res = _mm_setzero_si128(); + __m128i one = _mm_set1_epi16(1); + // For each row + for (y = 0; y < height; y++) { + // Covering the full width + for (x = 0; x < width; x += 16) { + // Load a, b, m in xmm registers + a = _mm_loadu_si128((const __m128i *)(a_ptr + x)); + b = _mm_loadu_si128((const __m128i *)(b_ptr + x)); + m = _mm_loadu_si128((const __m128i *)(m_ptr + x)); + + // Calculate the difference between a & b + temp1 = _mm_subs_epu8(a, b); + temp2 = _mm_subs_epu8(b, a); + temp1 = _mm_or_si128(temp1, temp2); + + // Multiply by m and add together + temp2 = _mm_maddubs_epi16(temp1, m); + // Pad out row result to 32 bit integers & add to running total + res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one)); + } + // Move onto the next row + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + // sad = (sad + 31) >> 6; + 
return (_mm_cvtsi128_si32(res) + 31) >> 6; +} + +static INLINE unsigned int masked_sad8xh_ssse3( + const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height) { + int y; + __m128i a, b, m, temp1, temp2, row_res; + __m128i res = _mm_setzero_si128(); + __m128i one = _mm_set1_epi16(1); + // Add the masked SAD for 2 rows at a time + for (y = 0; y < height; y += 2) { + // Load a, b, m in xmm registers + a = width8_load_2rows(a_ptr, a_stride); + b = width8_load_2rows(b_ptr, b_stride); + m = width8_load_2rows(m_ptr, m_stride); + + // Calculate the difference between a & b + temp1 = _mm_subs_epu8(a, b); + temp2 = _mm_subs_epu8(b, a); + temp1 = _mm_or_si128(temp1, temp2); + + // Multiply by m and add together + row_res = _mm_maddubs_epi16(temp1, m); + + // Pad out row result to 32 bit integers & add to running total + res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one)); + + // Move onto the next rows + a_ptr += a_stride * 2; + b_ptr += b_stride * 2; + m_ptr += m_stride * 2; + } + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + // sad = (sad + 31) >> 6; + return (_mm_cvtsi128_si32(res) + 31) >> 6; +} + +static INLINE unsigned int masked_sad4xh_ssse3( + const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height) { + int y; + __m128i a, b, m, temp1, temp2, row_res; + __m128i res = _mm_setzero_si128(); + __m128i one = _mm_set1_epi16(1); + // Add the masked SAD for 4 rows at a time + for (y = 0; y < height; y += 4) { + // Load a, b, m in xmm registers + a = width4_load_4rows(a_ptr, a_stride); + b = width4_load_4rows(b_ptr, b_stride); + m = width4_load_4rows(m_ptr, m_stride); + + // Calculate the difference between a & b + temp1 = _mm_subs_epu8(a, b); + temp2 = _mm_subs_epu8(b, a); + temp1 = _mm_or_si128(temp1, temp2); + + // Multiply by m and add together + row_res = _mm_maddubs_epi16(temp1, m); + + // 
Pad out row result to 32 bit integers & add to running total + res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one)); + + // Move onto the next rows + a_ptr += a_stride * 4; + b_ptr += b_stride * 4; + m_ptr += m_stride * 4; + } + // Pad out row result to 32 bit integers & add to running total + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + // sad = (sad + 31) >> 6; + return (_mm_cvtsi128_si32(res) + 31) >> 6; +} + +#if CONFIG_HIGHBITDEPTH +static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr, + int stride) { + __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr); + __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride)); + return _mm_unpacklo_epi64(temp1, temp2); +} + +static INLINE unsigned int highbd_masked_sad_ssse3( + const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int width, int height); + +static INLINE unsigned int highbd_masked_sad4xh_ssse3( + const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height); + +#define HIGHBD_MASKSADMXN_SSSE3(m, n) \ + unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, \ + msk_stride, m, n); \ + } + +#if CONFIG_EXT_PARTITION +HIGHBD_MASKSADMXN_SSSE3(128, 128) +HIGHBD_MASKSADMXN_SSSE3(128, 64) +HIGHBD_MASKSADMXN_SSSE3(64, 128) +#endif // CONFIG_EXT_PARTITION +HIGHBD_MASKSADMXN_SSSE3(64, 64) +HIGHBD_MASKSADMXN_SSSE3(64, 32) +HIGHBD_MASKSADMXN_SSSE3(32, 64) +HIGHBD_MASKSADMXN_SSSE3(32, 32) +HIGHBD_MASKSADMXN_SSSE3(32, 16) +HIGHBD_MASKSADMXN_SSSE3(16, 32) +HIGHBD_MASKSADMXN_SSSE3(16, 16) +HIGHBD_MASKSADMXN_SSSE3(16, 8) +HIGHBD_MASKSADMXN_SSSE3(8, 16) +HIGHBD_MASKSADMXN_SSSE3(8, 8) +HIGHBD_MASKSADMXN_SSSE3(8, 4) + +#define 
HIGHBD_MASKSAD4XN_SSSE3(n) \ + unsigned int aom_highbd_masked_sad4x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \ + msk_stride, n); \ + } + +HIGHBD_MASKSAD4XN_SSSE3(8) +HIGHBD_MASKSAD4XN_SSSE3(4) + +// For width a multiple of 8 +// Assumes values in m are <=64 +static INLINE unsigned int highbd_masked_sad_ssse3( + const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int width, int height) { + int y, x; + __m128i a, b, m, temp1, temp2; + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr); + __m128i res = _mm_setzero_si128(); + // For each row + for (y = 0; y < height; y++) { + // Covering the full width + for (x = 0; x < width; x += 8) { + // Load a, b, m in xmm registers + a = _mm_loadu_si128((const __m128i *)(a_ptr + x)); + b = _mm_loadu_si128((const __m128i *)(b_ptr + x)); + m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(m_ptr + x)), + _mm_setzero_si128()); + + // Calculate the difference between a & b + temp1 = _mm_subs_epu16(a, b); + temp2 = _mm_subs_epu16(b, a); + temp1 = _mm_or_si128(temp1, temp2); + + // Add result of multiplying by m and add pairs together to running total + res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m)); + } + // Move onto the next row + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + // sad = (sad + 31) >> 6; + return (_mm_cvtsi128_si32(res) + 31) >> 6; +} + +static INLINE unsigned int highbd_masked_sad4xh_ssse3( + const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height) { + int y; + __m128i a, b, m, temp1, temp2; + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr); + 
const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr); + __m128i res = _mm_setzero_si128(); + // Add the masked SAD for 2 rows at a time + for (y = 0; y < height; y += 2) { + // Load a, b, m in xmm registers + a = highbd_width4_load_2rows(a_ptr, a_stride); + b = highbd_width4_load_2rows(b_ptr, b_stride); + temp1 = _mm_loadl_epi64((const __m128i *)m_ptr); + temp2 = _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride)); + m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2), + _mm_setzero_si128()); + + // Calculate the difference between a & b + temp1 = _mm_subs_epu16(a, b); + temp2 = _mm_subs_epu16(b, a); + temp1 = _mm_or_si128(temp1, temp2); + + // Multiply by m and add together + res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m)); + + // Move onto the next rows + a_ptr += a_stride * 2; + b_ptr += b_stride * 2; + m_ptr += m_stride * 2; + } + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + // sad = (sad + 31) >> 6; + return (_mm_cvtsi128_si32(res) + 31) >> 6; +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c new file mode 100644 index 000000000..fe14597f6 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c @@ -0,0 +1,1948 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <stdlib.h> +#include <emmintrin.h> +#include <tmmintrin.h> + +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_filter.h" + +// Half pixel shift +#define HALF_PIXEL_OFFSET (BIL_SUBPEL_SHIFTS / 2) + +/***************************************************************************** + * Horizontal additions + *****************************************************************************/ + +static INLINE int32_t hsum_epi32_si32(__m128i v_d) { + v_d = _mm_hadd_epi32(v_d, v_d); + v_d = _mm_hadd_epi32(v_d, v_d); + return _mm_cvtsi128_si32(v_d); +} + +static INLINE int64_t hsum_epi64_si64(__m128i v_q) { + v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8)); +#if ARCH_X86_64 + return _mm_cvtsi128_si64(v_q); +#else + { + int64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, v_q); + return tmp; + } +#endif +} + +#if CONFIG_HIGHBITDEPTH +static INLINE int64_t hsum_epi32_si64(__m128i v_d) { + const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); + const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d); + const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d); + return hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q)); +} +#endif // CONFIG_HIGHBITDEPTH + +static INLINE uint32_t calc_masked_variance(__m128i v_sum_d, __m128i v_sse_q, + uint32_t *sse, int w, int h) { + int64_t sum64; + uint64_t sse64; + + // Horizontal sum + sum64 = hsum_epi32_si32(v_sum_d); + sse64 = hsum_epi64_si64(v_sse_q); + + sum64 = (sum64 >= 0) ? 
sum64 : -sum64; + + // Round + sum64 = ROUND_POWER_OF_TWO(sum64, 6); + sse64 = ROUND_POWER_OF_TWO(sse64, 12); + + // Store the SSE + *sse = (uint32_t)sse64; + // Compute the variance + return *sse - (uint32_t)((sum64 * sum64) / (w * h)); +} + +/***************************************************************************** + * n*16 Wide versions + *****************************************************************************/ + +static INLINE unsigned int masked_variancewxh_ssse3( + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { + int ii, jj; + + const __m128i v_zero = _mm_setzero_si128(); + + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + + assert((w % 16) == 0); + + for (ii = 0; ii < h; ii++) { + for (jj = 0; jj < w; jj += 16) { + // Load inputs - 8 bits + const __m128i v_a_b = _mm_loadu_si128((const __m128i *)(a + jj)); + const __m128i v_b_b = _mm_loadu_si128((const __m128i *)(b + jj)); + const __m128i v_m_b = _mm_loadu_si128((const __m128i *)(m + jj)); + + // Unpack to 16 bits - still containing max 8 bits + const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero); + const __m128i v_b0_w = _mm_unpacklo_epi8(v_b_b, v_zero); + const __m128i v_m0_w = _mm_unpacklo_epi8(v_m_b, v_zero); + const __m128i v_a1_w = _mm_unpackhi_epi8(v_a_b, v_zero); + const __m128i v_b1_w = _mm_unpackhi_epi8(v_b_b, v_zero); + const __m128i v_m1_w = _mm_unpackhi_epi8(v_m_b, v_zero); + + // Difference: [-255, 255] + const __m128i v_d0_w = _mm_sub_epi16(v_a0_w, v_b0_w); + const __m128i v_d1_w = _mm_sub_epi16(v_a1_w, v_b1_w); + + // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits + const __m128i v_e0_w = _mm_mullo_epi16(v_d0_w, v_m0_w); + const __m128i v_e0_d = _mm_madd_epi16(v_d0_w, v_m0_w); + const __m128i v_e1_w = _mm_mullo_epi16(v_d1_w, v_m1_w); + const __m128i v_e1_d = _mm_madd_epi16(v_d1_w, v_m1_w); + + // Squared error - using madd it's max (15 bits * 
15 bits) * 2 = 31 bits + const __m128i v_se0_d = _mm_madd_epi16(v_e0_w, v_e0_w); + const __m128i v_se1_d = _mm_madd_epi16(v_e1_w, v_e1_w); + + // Sum of v_se{0,1}_d - 31 bits + 31 bits = 32 bits + const __m128i v_se_d = _mm_add_epi32(v_se0_d, v_se1_d); + + // Unpack Squared error to 64 bits + const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero); + const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero); + + // Accumulate + v_sum_d = _mm_add_epi32(v_sum_d, v_e0_d); + v_sum_d = _mm_add_epi32(v_sum_d, v_e1_d); + v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q); + v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q); + } + + // Move on to next row + a += a_stride; + b += b_stride; + m += m_stride; + } + + return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h); +} + +#define MASKED_VARWXH(W, H) \ + unsigned int aom_masked_variance##W##x##H##_ssse3( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + return masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, m_stride, W, \ + H, sse); \ + } + +MASKED_VARWXH(16, 8) +MASKED_VARWXH(16, 16) +MASKED_VARWXH(16, 32) +MASKED_VARWXH(32, 16) +MASKED_VARWXH(32, 32) +MASKED_VARWXH(32, 64) +MASKED_VARWXH(64, 32) +MASKED_VARWXH(64, 64) +#if CONFIG_EXT_PARTITION +MASKED_VARWXH(64, 128) +MASKED_VARWXH(128, 64) +MASKED_VARWXH(128, 128) +#endif // CONFIG_EXT_PARTITION + +/***************************************************************************** + * 8 Wide versions + *****************************************************************************/ + +static INLINE unsigned int masked_variance8xh_ssse3( + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, + const uint8_t *m, int m_stride, int h, unsigned int *sse) { + int ii; + + const __m128i v_zero = _mm_setzero_si128(); + + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + + for (ii = 0; ii < h; ii++) { + // Load inputs - 8 bits + const __m128i v_a_b = 
_mm_loadl_epi64((const __m128i *)a); + const __m128i v_b_b = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_m_b = _mm_loadl_epi64((const __m128i *)m); + + // Unpack to 16 bits - still containing max 8 bits + const __m128i v_a_w = _mm_unpacklo_epi8(v_a_b, v_zero); + const __m128i v_b_w = _mm_unpacklo_epi8(v_b_b, v_zero); + const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero); + + // Difference: [-255, 255] + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + + // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits + const __m128i v_e_w = _mm_mullo_epi16(v_d_w, v_m_w); + const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w); + + // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits + const __m128i v_se_d = _mm_madd_epi16(v_e_w, v_e_w); + + // Unpack Squared error to 64 bits + const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero); + const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero); + + // Accumulate + v_sum_d = _mm_add_epi32(v_sum_d, v_e_d); + v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q); + v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q); + + // Move on to next row + a += a_stride; + b += b_stride; + m += m_stride; + } + + return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h); +} + +#define MASKED_VAR8XH(H) \ + unsigned int aom_masked_variance8x##H##_ssse3( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + return masked_variance8xh_ssse3(a, a_stride, b, b_stride, m, m_stride, H, \ + sse); \ + } + +MASKED_VAR8XH(4) +MASKED_VAR8XH(8) +MASKED_VAR8XH(16) + +/***************************************************************************** + * 4 Wide versions + *****************************************************************************/ + +static INLINE unsigned int masked_variance4xh_ssse3( + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, + const uint8_t *m, int m_stride, int h, unsigned int *sse) { + int ii; + + const 
__m128i v_zero = _mm_setzero_si128(); + + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + + assert((h % 2) == 0); + + for (ii = 0; ii < h / 2; ii++) { + // Load 2 input rows - 8 bits + const __m128i v_a0_b = _mm_cvtsi32_si128(*(const uint32_t *)a); + const __m128i v_b0_b = _mm_cvtsi32_si128(*(const uint32_t *)b); + const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t *)m); + const __m128i v_a1_b = _mm_cvtsi32_si128(*(const uint32_t *)(a + a_stride)); + const __m128i v_b1_b = _mm_cvtsi32_si128(*(const uint32_t *)(b + b_stride)); + const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t *)(m + m_stride)); + + // Interleave 2 rows into a single register + const __m128i v_a_b = _mm_unpacklo_epi32(v_a0_b, v_a1_b); + const __m128i v_b_b = _mm_unpacklo_epi32(v_b0_b, v_b1_b); + const __m128i v_m_b = _mm_unpacklo_epi32(v_m0_b, v_m1_b); + + // Unpack to 16 bits - still containing max 8 bits + const __m128i v_a_w = _mm_unpacklo_epi8(v_a_b, v_zero); + const __m128i v_b_w = _mm_unpacklo_epi8(v_b_b, v_zero); + const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero); + + // Difference: [-255, 255] + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + + // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits + const __m128i v_e_w = _mm_mullo_epi16(v_d_w, v_m_w); + const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w); + + // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits + const __m128i v_se_d = _mm_madd_epi16(v_e_w, v_e_w); + + // Unpack Squared error to 64 bits + const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero); + const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero); + + // Accumulate + v_sum_d = _mm_add_epi32(v_sum_d, v_e_d); + v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q); + v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q); + + // Move on to next 2 rows + a += a_stride * 2; + b += b_stride * 2; + m += m_stride * 2; + } + + return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h); +} + 
+#define MASKED_VAR4XH(H) \ + unsigned int aom_masked_variance4x##H##_ssse3( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + return masked_variance4xh_ssse3(a, a_stride, b, b_stride, m, m_stride, H, \ + sse); \ + } + +MASKED_VAR4XH(4) +MASKED_VAR4XH(8) + +#if CONFIG_HIGHBITDEPTH + +// Main calculation for n*8 wide blocks +static INLINE void highbd_masked_variance64_ssse3( + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, int64_t *sum, uint64_t *sse) { + int ii, jj; + + const __m128i v_zero = _mm_setzero_si128(); + + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + + assert((w % 8) == 0); + + for (ii = 0; ii < h; ii++) { + for (jj = 0; jj < w; jj += 8) { + // Load inputs - a and b as 8 x 16-bit words, m as 8 x 8-bit bytes + const __m128i v_a_w = _mm_loadu_si128((const __m128i *)(a + jj)); + const __m128i v_b_w = _mm_loadu_si128((const __m128i *)(b + jj)); + const __m128i v_m_b = _mm_loadl_epi64((const __m128i *)(m + jj)); + + // Unpack m to 16 bits - still containing max 8 bits + const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero); + + // Difference: [-4095, 4095] + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + + // Error - [-4095, 4095] * [0, 64] => sum of 2 of these fits in 19 bits + const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w); + + // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit) + const __m128i v_absd_w = _mm_abs_epi16(v_d_w); + const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero); + const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero); + const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d); + const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero); + const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero); + const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d); + // Square and sum the errors -> 36bits * 4 = 38bits + __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, 
v_se_q, v_elo1_d, v_ehi3_d; + v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d); + v_elo1_d = _mm_srli_si128(v_elo_d, 4); + v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d); + v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q); + v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d); + v_ehi3_d = _mm_srli_si128(v_ehi_d, 4); + v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d); + v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q); + v_se_q = _mm_add_epi64(v_se0_q, v_se1_q); + + // Accumulate + v_sum_d = _mm_add_epi32(v_sum_d, v_e_d); + v_sse_q = _mm_add_epi64(v_sse_q, v_se_q); + } + + // Move on to next row + a += a_stride; + b += b_stride; + m += m_stride; + } + + // Horizontal sum + *sum = hsum_epi32_si64(v_sum_d); + *sse = hsum_epi64_si64(v_sse_q); + + // Round + *sum = (*sum >= 0) ? *sum : -*sum; + *sum = ROUND_POWER_OF_TWO(*sum, 6); + *sse = ROUND_POWER_OF_TWO(*sse, 12); +} + +// Main calculation for 4 wide blocks +static INLINE void highbd_masked_variance64_4wide_ssse3( + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int h, int64_t *sum, uint64_t *sse) { + int ii; + + const __m128i v_zero = _mm_setzero_si128(); + + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + + assert((h % 2) == 0); + + for (ii = 0; ii < h / 2; ii++) { + // Load 2 input rows - a and b as 4 x 16-bit words each, m as 4 x 8-bit bytes + const __m128i v_a0_w = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_b0_w = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t *)m); + const __m128i v_a1_w = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_b1_w = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t *)(m + m_stride)); + + // Interleave 2 rows into a single register + const __m128i v_a_w = _mm_unpacklo_epi64(v_a0_w, v_a1_w); + const __m128i v_b_w = _mm_unpacklo_epi64(v_b0_w, v_b1_w); + const __m128i v_m_b = _mm_unpacklo_epi32(v_m0_b, v_m1_b); + + // Unpack to 16 bits - still 
containing max 8 bits + const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero); + + // Difference: [-4095, 4095] + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + + // Error - [-4095, 4095] * [0, 64] => fits in 19 bits (incl. sign bit) + const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w); + + // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit) + const __m128i v_absd_w = _mm_abs_epi16(v_d_w); + const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero); + const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero); + const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d); + const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero); + const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero); + const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d); + // Square and sum the errors -> 36bits * 4 = 38bits + __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d; + v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d); + v_elo1_d = _mm_srli_si128(v_elo_d, 4); + v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d); + v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q); + v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d); + v_ehi3_d = _mm_srli_si128(v_ehi_d, 4); + v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d); + v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q); + v_se_q = _mm_add_epi64(v_se0_q, v_se1_q); + + // Accumulate + v_sum_d = _mm_add_epi32(v_sum_d, v_e_d); + v_sse_q = _mm_add_epi64(v_sse_q, v_se_q); + + // Move on to next 2 rows + a += a_stride * 2; + b += b_stride * 2; + m += m_stride * 2; + } + + // Horizontal sum + *sum = hsum_epi32_si32(v_sum_d); + *sse = hsum_epi64_si64(v_sse_q); + + // Round + *sum = (*sum >= 0) ? 
*sum : -*sum; + *sum = ROUND_POWER_OF_TWO(*sum, 6); + *sse = ROUND_POWER_OF_TWO(*sse, 12); +} + +static INLINE unsigned int highbd_masked_variancewxh_ssse3( + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { + uint64_t sse64; + int64_t sum64; + + if (w == 4) + highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, + h, &sum64, &sse64); + else + highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, + &sum64, &sse64); + + // Store the SSE + *sse = (uint32_t)sse64; + // Compute and return variance + return *sse - (uint32_t)((sum64 * sum64) / (w * h)); +} + +static INLINE unsigned int highbd_10_masked_variancewxh_ssse3( + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { + uint64_t sse64; + int64_t sum64; + + if (w == 4) + highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, + h, &sum64, &sse64); + else + highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, + &sum64, &sse64); + + // Normalise + sum64 = ROUND_POWER_OF_TWO(sum64, 2); + sse64 = ROUND_POWER_OF_TWO(sse64, 4); + + // Store the SSE + *sse = (uint32_t)sse64; + // Compute and return variance + return *sse - (uint32_t)((sum64 * sum64) / (w * h)); +} + +static INLINE unsigned int highbd_12_masked_variancewxh_ssse3( + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { + uint64_t sse64; + int64_t sum64; + + if (w == 4) + highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, + h, &sum64, &sse64); + else + highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, + &sum64, &sse64); + + sum64 = ROUND_POWER_OF_TWO(sum64, 4); + sse64 = ROUND_POWER_OF_TWO(sse64, 8); + + // Store the SSE + *sse = (uint32_t)sse64; + // Compute and return variance 
+ return *sse - (uint32_t)((sum64 * sum64) / (w * h)); +} + +#define HIGHBD_MASKED_VARWXH(W, H) \ + unsigned int aom_highbd_masked_variance##W##x##H##_ssse3( \ + const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ + uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ + return highbd_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \ + m_stride, W, H, sse); \ + } \ + \ + unsigned int aom_highbd_10_masked_variance##W##x##H##_ssse3( \ + const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ + uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ + return highbd_10_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \ + m_stride, W, H, sse); \ + } \ + \ + unsigned int aom_highbd_12_masked_variance##W##x##H##_ssse3( \ + const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ + uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ + return highbd_12_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \ + m_stride, W, H, sse); \ + } + +HIGHBD_MASKED_VARWXH(4, 4) +HIGHBD_MASKED_VARWXH(4, 8) +HIGHBD_MASKED_VARWXH(8, 4) +HIGHBD_MASKED_VARWXH(8, 8) +HIGHBD_MASKED_VARWXH(8, 16) +HIGHBD_MASKED_VARWXH(16, 8) +HIGHBD_MASKED_VARWXH(16, 16) +HIGHBD_MASKED_VARWXH(16, 32) +HIGHBD_MASKED_VARWXH(32, 16) +HIGHBD_MASKED_VARWXH(32, 32) +HIGHBD_MASKED_VARWXH(32, 64) +HIGHBD_MASKED_VARWXH(64, 32) +HIGHBD_MASKED_VARWXH(64, 64) +#if CONFIG_EXT_PARTITION +HIGHBD_MASKED_VARWXH(64, 128) +HIGHBD_MASKED_VARWXH(128, 64) +HIGHBD_MASKED_VARWXH(128, 128) +#endif // CONFIG_EXT_PARTITION + +#endif + +////////////////////////////////////////////////////////////////////////////// +// Sub pixel versions +////////////////////////////////////////////////////////////////////////////// + +typedef 
__m128i (*filter_fn_t)(__m128i v_a_b, __m128i v_b_b, + __m128i v_filter_b); + +static INLINE __m128i apply_filter_avg(const __m128i v_a_b, const __m128i v_b_b, + const __m128i v_filter_b) { + (void)v_filter_b; + return _mm_avg_epu8(v_a_b, v_b_b); +} + +static INLINE __m128i apply_filter(const __m128i v_a_b, const __m128i v_b_b, + const __m128i v_filter_b) { + const __m128i v_rounding_w = _mm_set1_epi16(1 << (FILTER_BITS - 1)); + __m128i v_input_lo_b = _mm_unpacklo_epi8(v_a_b, v_b_b); + __m128i v_input_hi_b = _mm_unpackhi_epi8(v_a_b, v_b_b); + __m128i v_temp0_w = _mm_maddubs_epi16(v_input_lo_b, v_filter_b); + __m128i v_temp1_w = _mm_maddubs_epi16(v_input_hi_b, v_filter_b); + __m128i v_res_lo_w = + _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), FILTER_BITS); + __m128i v_res_hi_w = + _mm_srai_epi16(_mm_add_epi16(v_temp1_w, v_rounding_w), FILTER_BITS); + return _mm_packus_epi16(v_res_lo_w, v_res_hi_w); +} + +// Apply the filter to the contents of the lower half of a and b +static INLINE void apply_filter_lo(const __m128i v_a_lo_b, + const __m128i v_b_lo_b, + const __m128i v_filter_b, __m128i *v_res_w) { + const __m128i v_rounding_w = _mm_set1_epi16(1 << (FILTER_BITS - 1)); + __m128i v_input_b = _mm_unpacklo_epi8(v_a_lo_b, v_b_lo_b); + __m128i v_temp0_w = _mm_maddubs_epi16(v_input_b, v_filter_b); + *v_res_w = + _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), FILTER_BITS); +} + +static void sum_and_sse(const __m128i v_a_b, const __m128i v_b_b, + const __m128i v_m_b, __m128i *v_sum_d, + __m128i *v_sse_q) { + const __m128i v_zero = _mm_setzero_si128(); + // Unpack to 16 bits - still containing max 8 bits + const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero); + const __m128i v_b0_w = _mm_unpacklo_epi8(v_b_b, v_zero); + const __m128i v_m0_w = _mm_unpacklo_epi8(v_m_b, v_zero); + const __m128i v_a1_w = _mm_unpackhi_epi8(v_a_b, v_zero); + const __m128i v_b1_w = _mm_unpackhi_epi8(v_b_b, v_zero); + const __m128i v_m1_w = _mm_unpackhi_epi8(v_m_b, v_zero); + + // 
Difference: [-255, 255] + const __m128i v_d0_w = _mm_sub_epi16(v_a0_w, v_b0_w); + const __m128i v_d1_w = _mm_sub_epi16(v_a1_w, v_b1_w); + + // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits + const __m128i v_e0_w = _mm_mullo_epi16(v_d0_w, v_m0_w); + const __m128i v_e0_d = _mm_madd_epi16(v_d0_w, v_m0_w); + const __m128i v_e1_w = _mm_mullo_epi16(v_d1_w, v_m1_w); + const __m128i v_e1_d = _mm_madd_epi16(v_d1_w, v_m1_w); + + // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits + const __m128i v_se0_d = _mm_madd_epi16(v_e0_w, v_e0_w); + const __m128i v_se1_d = _mm_madd_epi16(v_e1_w, v_e1_w); + + // Sum of v_se{0,1}_d - 31 bits + 31 bits = 32 bits + const __m128i v_se_d = _mm_add_epi32(v_se0_d, v_se1_d); + + // Unpack Squared error to 64 bits + const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero); + const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero); + + // Accumulate + *v_sum_d = _mm_add_epi32(*v_sum_d, v_e0_d); + *v_sum_d = _mm_add_epi32(*v_sum_d, v_e1_d); + *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_lo_q); + *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_hi_q); +} + +// Functions for width (W) >= 16 +unsigned int aom_masked_subpel_varWxH_xzero(const uint8_t *src, int src_stride, + int yoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int w, int h, + filter_fn_t filter_fn) { + int i, j; + __m128i v_src0_b, v_src1_b, v_res_b, v_dst_b, v_msk_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + const __m128i v_filter_b = _mm_set1_epi16( + (bilinear_filters_2t[yoffset][1] << 8) + bilinear_filters_2t[yoffset][0]); + assert(yoffset < BIL_SUBPEL_SHIFTS); + for (j = 0; j < w; j += 16) { + // Load the first row ready + v_src0_b = _mm_loadu_si128((const __m128i *)(src + j)); + // Process 2 rows at a time + for (i = 0; i < h; i += 2) { + // Load the next row apply the filter + v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + src_stride)); 
+ v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b); + // Load the dst and msk for the variance calculation + v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j)); + sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); + + // Load the next row & apply the filter + v_src0_b = _mm_loadu_si128((const __m128i *)(src + j + src_stride * 2)); + v_res_b = filter_fn(v_src1_b, v_src0_b, v_filter_b); + // Load the dst and msk for the variance calculation + v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j + dst_stride)); + v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j + msk_stride)); + sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); + // Move onto the next block of rows + src += src_stride * 2; + dst += dst_stride * 2; + msk += msk_stride * 2; + } + // Reset to the top of the block + src -= src_stride * h; + dst -= dst_stride * h; + msk -= msk_stride * h; + } + return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h); +} +unsigned int aom_masked_subpel_varWxH_yzero(const uint8_t *src, int src_stride, + int xoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int w, int h, + filter_fn_t filter_fn) { + int i, j; + __m128i v_src0_b, v_src1_b, v_res_b, v_dst_b, v_msk_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + const __m128i v_filter_b = _mm_set1_epi16( + (bilinear_filters_2t[xoffset][1] << 8) + bilinear_filters_2t[xoffset][0]); + assert(xoffset < BIL_SUBPEL_SHIFTS); + for (i = 0; i < h; i++) { + for (j = 0; j < w; j += 16) { + // Load this row and the same row offset by one pixel & apply the filter + v_src0_b = _mm_loadu_si128((const __m128i *)(src + j)); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + 1)); + v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b); + + // Load the dst and msk for the variance calculation + v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadu_si128((const 
__m128i *)(msk + j)); + sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); + } + src += src_stride; + dst += dst_stride; + msk += msk_stride; + } + return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h); +} +unsigned int aom_masked_subpel_varWxH_xnonzero_ynonzero( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int w, int h, filter_fn_t xfilter_fn, + filter_fn_t yfilter_fn) { + int i, j; + __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b; + __m128i v_filtered0_b, v_filtered1_b, v_res_b, v_dst_b, v_msk_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + const __m128i v_filterx_b = _mm_set1_epi16( + (bilinear_filters_2t[xoffset][1] << 8) + bilinear_filters_2t[xoffset][0]); + const __m128i v_filtery_b = _mm_set1_epi16( + (bilinear_filters_2t[yoffset][1] << 8) + bilinear_filters_2t[yoffset][0]); + assert(yoffset < BIL_SUBPEL_SHIFTS); + assert(xoffset < BIL_SUBPEL_SHIFTS); + for (j = 0; j < w; j += 16) { + // Load the first row ready + v_src0_b = _mm_loadu_si128((const __m128i *)(src + j)); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + 1)); + v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b); + // Process 2 rows at a time + for (i = 0; i < h; i += 2) { + // Load the next row & apply the filter + v_src2_b = _mm_loadu_si128((const __m128i *)(src + src_stride + j)); + v_src3_b = _mm_loadu_si128((const __m128i *)(src + src_stride + j + 1)); + v_filtered1_b = xfilter_fn(v_src2_b, v_src3_b, v_filterx_b); + // Load the dst and msk for the variance calculation + v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j)); + // Complete the calculation for this row and add it to the running total + v_res_b = yfilter_fn(v_filtered0_b, v_filtered1_b, v_filtery_b); + sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); + + // Load the next row & apply 
the filter + v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j)); + v_src1_b = + _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j + 1)); + v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b); + // Load the dst and msk for the variance calculation + v_dst_b = _mm_loadu_si128((const __m128i *)(dst + dst_stride + j)); + v_msk_b = _mm_loadu_si128((const __m128i *)(msk + msk_stride + j)); + // Complete the calculation for this row and add it to the running total + v_res_b = yfilter_fn(v_filtered1_b, v_filtered0_b, v_filtery_b); + sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); + // Move onto the next block of rows + src += src_stride * 2; + dst += dst_stride * 2; + msk += msk_stride * 2; + } + // Reset to the top of the block + src -= src_stride * h; + dst -= dst_stride * h; + msk -= msk_stride * h; + } + return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h); +} + +// Note order in which rows loaded xmm[127:96] = row 1, xmm[95:64] = row 2, +// xmm[63:32] = row 3, xmm[31:0] = row 4 +unsigned int aom_masked_subpel_var4xH_xzero(const uint8_t *src, int src_stride, + int yoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int h) { + int i; + __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered1_w, v_filtered2_w; + __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b; + __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + + bilinear_filters_2t[yoffset][0]); + assert(yoffset < BIL_SUBPEL_SHIFTS); + // Load the first row of src data ready + v_src0_b = _mm_loadl_epi64((const __m128i *)src); + for (i = 0; i < h; i += 4) { + // Load the rest of the source data for these rows + v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); + v_src1_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b); + v_src2_b = 
_mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); + v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); + v_src3_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b); + v_src0_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); + // Load the dst data + v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0)); + v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1)); + v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b); + v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2)); + v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3)); + v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b); + v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b); + // Load the mask data + v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0)); + v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1)); + v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b); + v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2)); + v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3)); + v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b); + v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b); + // Apply the y filter + if (yoffset == HALF_PIXEL_OFFSET) { + v_src1_b = _mm_unpacklo_epi64(v_src3_b, v_src1_b); + v_src2_b = + _mm_or_si128(_mm_slli_si128(v_src1_b, 4), + _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0))); + v_res_b = _mm_avg_epu8(v_src1_b, v_src2_b); + } else { + v_src2_b = + _mm_or_si128(_mm_slli_si128(v_src1_b, 4), + _mm_and_si128(v_src2_b, _mm_setr_epi32(-1, 0, 0, 0))); + apply_filter_lo(v_src1_b, v_src2_b, v_filter_b, &v_filtered1_w); + v_src2_b = + _mm_or_si128(_mm_slli_si128(v_src3_b, 4), + _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0))); + apply_filter_lo(v_src3_b, v_src2_b, v_filter_b, &v_filtered2_w); + v_res_b = _mm_packus_epi16(v_filtered2_w, v_filtered1_w); + } + // Compute the sum and SSE + sum_and_sse(v_res_b, v_dst0_b, 
v_msk0_b, &v_sum_d, &v_sse_q); + // Move onto the next set of rows + src += src_stride * 4; + dst += dst_stride * 4; + msk += msk_stride * 4; + } + return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h); +} + +// Note order in which rows loaded xmm[127:64] = row 1, xmm[63:0] = row 2 +unsigned int aom_masked_subpel_var8xH_xzero(const uint8_t *src, int src_stride, + int yoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int h) { + int i; + __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_res_b; + __m128i v_dst_b = _mm_setzero_si128(); + __m128i v_msk_b = _mm_setzero_si128(); + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + + bilinear_filters_2t[yoffset][0]); + assert(yoffset < BIL_SUBPEL_SHIFTS); + // Load the first row of src data ready + v_src0_b = _mm_loadl_epi64((const __m128i *)src); + for (i = 0; i < h; i += 2) { + if (yoffset == HALF_PIXEL_OFFSET) { + // Load the rest of the source data for these rows + v_src1_b = _mm_or_si128( + _mm_slli_si128(v_src0_b, 8), + _mm_loadl_epi64((const __m128i *)(src + src_stride * 1))); + v_src0_b = _mm_or_si128( + _mm_slli_si128(v_src1_b, 8), + _mm_loadl_epi64((const __m128i *)(src + src_stride * 2))); + // Apply the y filter + v_res_b = _mm_avg_epu8(v_src1_b, v_src0_b); + } else { + // Load the data and apply the y filter + v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); + apply_filter_lo(v_src0_b, v_src1_b, v_filter_b, &v_filtered0_w); + v_src0_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); + apply_filter_lo(v_src1_b, v_src0_b, v_filter_b, &v_filtered1_w); + v_res_b = _mm_packus_epi16(v_filtered1_w, v_filtered0_w); + } + // Load the dst data + v_dst_b = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0))); + // Load the mask 
data + v_msk_b = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0))); + // Compute the sum and SSE + sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); + // Move onto the next set of rows + src += src_stride * 2; + dst += dst_stride * 2; + msk += msk_stride * 2; + } + return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h); +} + +// Note order in which rows loaded xmm[127:96] = row 1, xmm[95:64] = row 2, +// xmm[63:32] = row 3, xmm[31:0] = row 4 +unsigned int aom_masked_subpel_var4xH_yzero(const uint8_t *src, int src_stride, + int xoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int h) { + int i; + __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w; + __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b; + __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b; + __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + + bilinear_filters_2t[xoffset][0]); + assert(xoffset < BIL_SUBPEL_SHIFTS); + for (i = 0; i < h; i += 4) { + // Load the src data + v_src0_b = _mm_loadl_epi64((const __m128i *)src); + v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); + v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); + v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b); + v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); + v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); + v_src0_shift_b = _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b); + v_src2_shift_b = _mm_srli_si128(v_src2_b, 1); + v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); + v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b); + v_src3_shift_b = _mm_srli_si128(v_src3_b, 1); + v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, 
v_src2_shift_b); + // Load the dst data + v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0)); + v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1)); + v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b); + v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2)); + v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3)); + v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b); + v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b); + // Load the mask data + v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0)); + v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1)); + v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b); + v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2)); + v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3)); + v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b); + v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b); + v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b); + v_res_b = _mm_avg_epu8(v_src0_b, v_src0_shift_b); + } else { + apply_filter_lo(v_src0_b, v_src0_shift_b, v_filter_b, &v_filtered0_w); + apply_filter_lo(v_src2_b, v_src2_shift_b, v_filter_b, &v_filtered2_w); + v_res_b = _mm_packus_epi16(v_filtered2_w, v_filtered0_w); + } + // Compute the sum and SSE + sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q); + // Move onto the next set of rows + src += src_stride * 4; + dst += dst_stride * 4; + msk += msk_stride * 4; + } + return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h); +} + +unsigned int aom_masked_subpel_var8xH_yzero(const uint8_t *src, int src_stride, + int xoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int h) { + int i; + __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w; + __m128i 
v_src0_shift_b, v_src1_shift_b, v_res_b, v_dst_b, v_msk_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + + bilinear_filters_2t[xoffset][0]); + assert(xoffset < BIL_SUBPEL_SHIFTS); + for (i = 0; i < h; i += 2) { + // Load the src data + v_src0_b = _mm_loadu_si128((const __m128i *)(src)); + v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride)); + v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b); + v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b); + v_res_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b); + } else { + apply_filter_lo(v_src0_b, v_src0_shift_b, v_filter_b, &v_filtered0_w); + apply_filter_lo(v_src1_b, v_src1_shift_b, v_filter_b, &v_filtered1_w); + v_res_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w); + } + // Load the dst data + v_dst_b = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); + // Load the mask data + v_msk_b = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); + // Compute the sum and SSE + sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); + // Move onto the next set of rows + src += src_stride * 2; + dst += dst_stride * 2; + msk += msk_stride * 2; + } + return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h); +} + +// Note order in which rows loaded xmm[127:96] = row 1, xmm[95:64] = row 2, +// xmm[63:32] = row 3, xmm[31:0] = row 4 +unsigned int aom_masked_subpel_var4xH_xnonzero_ynonzero( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int h) { + int 
i; + __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w; + __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b; + __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b, v_temp_b; + __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_extra_row_b, v_res_b; + __m128i v_xres_b[2]; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + __m128i v_filterx_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + + bilinear_filters_2t[xoffset][0]); + __m128i v_filtery_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + + bilinear_filters_2t[yoffset][0]); + assert(xoffset < BIL_SUBPEL_SHIFTS); + assert(yoffset < BIL_SUBPEL_SHIFTS); + for (i = 0; i < h; i += 4) { + // Load the src data + v_src0_b = _mm_loadl_epi64((const __m128i *)src); + v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); + v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); + v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b); + v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); + v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); + v_src0_shift_b = _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b); + v_src2_shift_b = _mm_srli_si128(v_src2_b, 1); + v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); + v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b); + v_src3_shift_b = _mm_srli_si128(v_src3_b, 1); + v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b); + v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b); + v_xres_b[i == 0 ? 0 : 1] = _mm_avg_epu8(v_src0_b, v_src0_shift_b); + } else { + apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w); + apply_filter_lo(v_src2_b, v_src2_shift_b, v_filterx_b, &v_filtered2_w); + v_xres_b[i == 0 ? 
0 : 1] = _mm_packus_epi16(v_filtered2_w, v_filtered0_w); + } + // Move onto the next set of rows + src += src_stride * 4; + } + // Load one more row to be used in the y filter + v_src0_b = _mm_loadl_epi64((const __m128i *)src); + v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_extra_row_b = _mm_and_si128(_mm_avg_epu8(v_src0_b, v_src0_shift_b), + _mm_setr_epi32(-1, 0, 0, 0)); + } else { + apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w); + v_extra_row_b = + _mm_and_si128(_mm_packus_epi16(v_filtered0_w, _mm_setzero_si128()), + _mm_setr_epi32(-1, 0, 0, 0)); + } + + for (i = 0; i < h; i += 4) { + if (h == 8 && i == 0) { + v_temp_b = _mm_or_si128(_mm_slli_si128(v_xres_b[0], 4), + _mm_srli_si128(v_xres_b[1], 12)); + } else { + v_temp_b = _mm_or_si128(_mm_slli_si128(v_xres_b[i == 0 ? 0 : 1], 4), + v_extra_row_b); + } + // Apply the y filter + if (yoffset == HALF_PIXEL_OFFSET) { + v_res_b = _mm_avg_epu8(v_xres_b[i == 0 ? 0 : 1], v_temp_b); + } else { + v_res_b = apply_filter(v_xres_b[i == 0 ? 
0 : 1], v_temp_b, v_filtery_b); + } + + // Load the dst data + v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0)); + v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1)); + v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b); + v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2)); + v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3)); + v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b); + v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b); + // Load the mask data + v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0)); + v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1)); + v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b); + v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2)); + v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3)); + v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b); + v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b); + // Compute the sum and SSE + sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q); + // Move onto the next set of rows + dst += dst_stride * 4; + msk += msk_stride * 4; + } + return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h); +} + +unsigned int aom_masked_subpel_var8xH_xnonzero_ynonzero( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int h) { + int i; + __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_dst_b, v_msk_b; + __m128i v_src0_shift_b, v_src1_shift_b; + __m128i v_xres0_b, v_xres1_b, v_res_b, v_temp_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + __m128i v_filterx_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + + bilinear_filters_2t[xoffset][0]); + __m128i v_filtery_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + + bilinear_filters_2t[yoffset][0]); + assert(xoffset < 
BIL_SUBPEL_SHIFTS); + assert(yoffset < BIL_SUBPEL_SHIFTS); + // Load the first block of src data + v_src0_b = _mm_loadu_si128((const __m128i *)(src)); + v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride)); + v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b); + v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b); + v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b); + } else { + apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w); + apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w); + v_xres0_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w); + } + for (i = 0; i < h; i += 4) { + // Load the next block of src data + v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 2)); + v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 3)); + v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b); + v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b); + v_xres1_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b); + } else { + apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w); + apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w); + v_xres1_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w); + } + // Apply the y filter to the previous block + v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres0_b, 8), + _mm_slli_si128(v_xres1_b, 8)); + if (yoffset == HALF_PIXEL_OFFSET) { + v_res_b = _mm_avg_epu8(v_xres0_b, v_temp_b); + } else { + v_res_b = apply_filter(v_xres0_b, v_temp_b, v_filtery_b); + } + // Load the dst data + v_dst_b = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), + _mm_loadl_epi64((const __m128i *)(dst 
+ dst_stride * 1))); + // Load the mask data + v_msk_b = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); + // Compute the sum and SSE + sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); + + // Load the next block of src data + v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 4)); + v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 5)); + v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b); + v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b); + v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b); + } else { + apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w); + apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w); + v_xres0_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w); + } + // Apply the y filter to the previous block + v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres1_b, 8), + _mm_slli_si128(v_xres0_b, 8)); + if (yoffset == HALF_PIXEL_OFFSET) { + v_res_b = _mm_avg_epu8(v_xres1_b, v_temp_b); + } else { + v_res_b = apply_filter(v_xres1_b, v_temp_b, v_filtery_b); + } + // Load the dst data + v_dst_b = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3))); + // Load the mask data + v_msk_b = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3))); + // Compute the sum and SSE + sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); + // Move onto the next set of rows + src += src_stride * 4; + dst += dst_stride * 4; + msk += msk_stride * 4; + } + return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h); +} + +// For W >=16 +#define MASK_SUBPIX_VAR_LARGE(W, 
H) \ + unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + assert(W % 16 == 0); \ + if (xoffset == 0) { \ + if (yoffset == 0) \ + return aom_masked_variance##W##x##H##_ssse3( \ + src, src_stride, dst, dst_stride, msk, msk_stride, sse); \ + else if (yoffset == HALF_PIXEL_OFFSET) \ + return aom_masked_subpel_varWxH_xzero( \ + src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, apply_filter_avg); \ + else \ + return aom_masked_subpel_varWxH_xzero(src, src_stride, yoffset, dst, \ + dst_stride, msk, msk_stride, \ + sse, W, H, apply_filter); \ + } else if (yoffset == 0) { \ + if (xoffset == HALF_PIXEL_OFFSET) \ + return aom_masked_subpel_varWxH_yzero( \ + src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, apply_filter_avg); \ + else \ + return aom_masked_subpel_varWxH_yzero(src, src_stride, xoffset, dst, \ + dst_stride, msk, msk_stride, \ + sse, W, H, apply_filter); \ + } else if (xoffset == HALF_PIXEL_OFFSET) { \ + if (yoffset == HALF_PIXEL_OFFSET) \ + return aom_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, \ + dst_stride, msk, msk_stride, sse, W, H, apply_filter_avg, \ + apply_filter_avg); \ + else \ + return aom_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \ + msk_stride, sse, W, H, apply_filter_avg, apply_filter); \ + } else { \ + if (yoffset == HALF_PIXEL_OFFSET) \ + return aom_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, apply_filter, apply_filter_avg); \ + else \ + return aom_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, xoffset, yoffset, dst, dst_stride, msk, \ + msk_stride, sse, W, H, 
apply_filter, apply_filter); \ + } \ + } + +// For W < 16 +#define MASK_SUBPIX_VAR_SMALL(W, H) \ + unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + assert(W == 4 || W == 8); \ + if (xoffset == 0 && yoffset == 0) \ + return aom_masked_variance##W##x##H##_ssse3( \ + src, src_stride, dst, dst_stride, msk, msk_stride, sse); \ + else if (xoffset == 0) \ + return aom_masked_subpel_var##W##xH_xzero( \ + src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, H); \ + else if (yoffset == 0) \ + return aom_masked_subpel_var##W##xH_yzero( \ + src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, H); \ + else \ + return aom_masked_subpel_var##W##xH_xnonzero_ynonzero( \ + src, src_stride, xoffset, yoffset, dst, dst_stride, msk, msk_stride, \ + sse, H); \ + } + +MASK_SUBPIX_VAR_SMALL(4, 4) +MASK_SUBPIX_VAR_SMALL(4, 8) +MASK_SUBPIX_VAR_SMALL(8, 4) +MASK_SUBPIX_VAR_SMALL(8, 8) +MASK_SUBPIX_VAR_SMALL(8, 16) +MASK_SUBPIX_VAR_LARGE(16, 8) +MASK_SUBPIX_VAR_LARGE(16, 16) +MASK_SUBPIX_VAR_LARGE(16, 32) +MASK_SUBPIX_VAR_LARGE(32, 16) +MASK_SUBPIX_VAR_LARGE(32, 32) +MASK_SUBPIX_VAR_LARGE(32, 64) +MASK_SUBPIX_VAR_LARGE(64, 32) +MASK_SUBPIX_VAR_LARGE(64, 64) +#if CONFIG_EXT_PARTITION +MASK_SUBPIX_VAR_LARGE(64, 128) +MASK_SUBPIX_VAR_LARGE(128, 64) +MASK_SUBPIX_VAR_LARGE(128, 128) +#endif // CONFIG_EXT_PARTITION + +#if CONFIG_HIGHBITDEPTH +typedef uint32_t (*highbd_calc_masked_var_t)(__m128i v_sum_d, __m128i v_sse_q, + uint32_t *sse, int w, int h); +typedef unsigned int (*highbd_variance_fn_t)(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m, int m_stride, + unsigned int *sse); +typedef __m128i (*highbd_filter_fn_t)(__m128i v_a_w, __m128i v_b_w, + __m128i v_filter_w); + +static INLINE __m128i highbd_apply_filter_avg(const __m128i v_a_w, + const __m128i 
v_b_w, + const __m128i v_filter_w) { + (void)v_filter_w; + return _mm_avg_epu16(v_a_w, v_b_w); +} + +static INLINE __m128i highbd_apply_filter(const __m128i v_a_w, + const __m128i v_b_w, + const __m128i v_filter_w) { + const __m128i v_rounding_d = _mm_set1_epi32(1 << (FILTER_BITS - 1)); + __m128i v_input_lo_w = _mm_unpacklo_epi16(v_a_w, v_b_w); + __m128i v_input_hi_w = _mm_unpackhi_epi16(v_a_w, v_b_w); + __m128i v_temp0_d = _mm_madd_epi16(v_input_lo_w, v_filter_w); + __m128i v_temp1_d = _mm_madd_epi16(v_input_hi_w, v_filter_w); + __m128i v_res_lo_d = + _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), FILTER_BITS); + __m128i v_res_hi_d = + _mm_srai_epi32(_mm_add_epi32(v_temp1_d, v_rounding_d), FILTER_BITS); + return _mm_packs_epi32(v_res_lo_d, v_res_hi_d); +} +// Apply the filter to the contents of the lower half of a and b +static INLINE void highbd_apply_filter_lo(const __m128i v_a_lo_w, + const __m128i v_b_lo_w, + const __m128i v_filter_w, + __m128i *v_res_d) { + const __m128i v_rounding_d = _mm_set1_epi32(1 << (FILTER_BITS - 1)); + __m128i v_input_w = _mm_unpacklo_epi16(v_a_lo_w, v_b_lo_w); + __m128i v_temp0_d = _mm_madd_epi16(v_input_w, v_filter_w); + *v_res_d = + _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), FILTER_BITS); +} + +static void highbd_sum_and_sse(const __m128i v_a_w, const __m128i v_b_w, + const __m128i v_m_b, __m128i *v_sum_d, + __m128i *v_sse_q) { + const __m128i v_zero = _mm_setzero_si128(); + const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero); + + // Difference: [-2^12, 2^12] => 13 bits (incld sign bit) + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + + // Error - [-4095, 4095] * [0, 64] & sum pairs => fits in 19 + 1 bits + const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w); + + // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit) + const __m128i v_absd_w = _mm_abs_epi16(v_d_w); + const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero); + const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero); + 
const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d); + const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero); + const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero); + const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d); + // Square and sum the errors -> 36bits * 4 = 38bits + __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d; + v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d); + v_elo1_d = _mm_srli_si128(v_elo_d, 4); + v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d); + v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q); + v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d); + v_ehi3_d = _mm_srli_si128(v_ehi_d, 4); + v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d); + v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q); + v_se_q = _mm_add_epi64(v_se0_q, v_se1_q); + + // Accumulate + *v_sum_d = _mm_add_epi32(*v_sum_d, v_e_d); + *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_q); +} + +static INLINE uint32_t highbd_10_calc_masked_variance(__m128i v_sum_d, + __m128i v_sse_q, + uint32_t *sse, int w, + int h) { + int64_t sum64; + uint64_t sse64; + + // Horizontal sum + sum64 = hsum_epi32_si32(v_sum_d); + sse64 = hsum_epi64_si64(v_sse_q); + + sum64 = (sum64 >= 0) ? sum64 : -sum64; + + // Round + sum64 = ROUND_POWER_OF_TWO(sum64, 6); + sse64 = ROUND_POWER_OF_TWO(sse64, 12); + + // Normalise + sum64 = ROUND_POWER_OF_TWO(sum64, 2); + sse64 = ROUND_POWER_OF_TWO(sse64, 4); + + // Store the SSE + *sse = (uint32_t)sse64; + // Compute the variance + return *sse - (uint32_t)((sum64 * sum64) / (w * h)); +} +static INLINE uint32_t highbd_12_calc_masked_variance(__m128i v_sum_d, + __m128i v_sse_q, + uint32_t *sse, int w, + int h) { + int64_t sum64; + uint64_t sse64; + + // Horizontal sum + sum64 = hsum_epi32_si64(v_sum_d); + sse64 = hsum_epi64_si64(v_sse_q); + + sum64 = (sum64 >= 0) ? 
sum64 : -sum64; + + // Round + sum64 = ROUND_POWER_OF_TWO(sum64, 6); + sse64 = ROUND_POWER_OF_TWO(sse64, 12); + + // Normalise + sum64 = ROUND_POWER_OF_TWO(sum64, 4); + sse64 = ROUND_POWER_OF_TWO(sse64, 8); + + // Store the SSE + *sse = (uint32_t)sse64; + // Compute the variance + return *sse - (uint32_t)((sum64 * sum64) / (w * h)); +} + +// High bit depth functions for width (W) >= 8 +unsigned int aom_highbd_masked_subpel_varWxH_xzero( + const uint16_t *src, int src_stride, int yoffset, const uint16_t *dst, + int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse, + int w, int h, highbd_filter_fn_t filter_fn, + highbd_calc_masked_var_t calc_var) { + int i, j; + __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + const __m128i v_filter_w = + _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + + bilinear_filters_2t[yoffset][0]); + assert(yoffset < BIL_SUBPEL_SHIFTS); + for (j = 0; j < w; j += 8) { + // Load the first row ready + v_src0_w = _mm_loadu_si128((const __m128i *)(src + j)); + // Process 2 rows at a time + for (i = 0; i < h; i += 2) { + // Load the next row apply the filter + v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + src_stride)); + v_res_w = filter_fn(v_src0_w, v_src1_w, v_filter_w); + // Load the dst and msk for the variance calculation + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j)); + highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); + + // Load the next row apply the filter + v_src0_w = _mm_loadu_si128((const __m128i *)(src + j + src_stride * 2)); + v_res_w = filter_fn(v_src1_w, v_src0_w, v_filter_w); + // Load the dst and msk for the variance calculation + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j + dst_stride)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j + msk_stride)); + highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, 
&v_sse_q); + // Move onto the next block of rows + src += src_stride * 2; + dst += dst_stride * 2; + msk += msk_stride * 2; + } + // Reset to the top of the block + src -= src_stride * h; + dst -= dst_stride * h; + msk -= msk_stride * h; + } + return calc_var(v_sum_d, v_sse_q, sse, w, h); +} +unsigned int aom_highbd_masked_subpel_varWxH_yzero( + const uint16_t *src, int src_stride, int xoffset, const uint16_t *dst, + int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse, + int w, int h, highbd_filter_fn_t filter_fn, + highbd_calc_masked_var_t calc_var) { + int i, j; + __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + const __m128i v_filter_w = + _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + + bilinear_filters_2t[xoffset][0]); + assert(xoffset < BIL_SUBPEL_SHIFTS); + for (i = 0; i < h; i++) { + for (j = 0; j < w; j += 8) { + // Load this row & apply the filter to them + v_src0_w = _mm_loadu_si128((const __m128i *)(src + j)); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + 1)); + v_res_w = filter_fn(v_src0_w, v_src1_w, v_filter_w); + + // Load the dst and msk for the variance calculation + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j)); + highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); + } + src += src_stride; + dst += dst_stride; + msk += msk_stride; + } + return calc_var(v_sum_d, v_sse_q, sse, w, h); +} + +unsigned int aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( + const uint16_t *src, int src_stride, int xoffset, int yoffset, + const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int w, int h, highbd_filter_fn_t xfilter_fn, + highbd_filter_fn_t yfilter_fn, highbd_calc_masked_var_t calc_var) { + int i, j; + __m128i v_src0_w, v_src1_w, v_src2_w, v_src3_w; + __m128i v_filtered0_w, v_filtered1_w, v_res_w, v_dst_w, 
v_msk_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + const __m128i v_filterx_w = + _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + + bilinear_filters_2t[xoffset][0]); + const __m128i v_filtery_w = + _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + + bilinear_filters_2t[yoffset][0]); + assert(xoffset < BIL_SUBPEL_SHIFTS); + assert(yoffset < BIL_SUBPEL_SHIFTS); + for (j = 0; j < w; j += 8) { + // Load the first row ready + v_src0_w = _mm_loadu_si128((const __m128i *)(src + j)); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + 1)); + v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w); + // Process 2 rows at a time + for (i = 0; i < h; i += 2) { + // Load the next row & apply the filter + v_src2_w = _mm_loadu_si128((const __m128i *)(src + src_stride + j)); + v_src3_w = _mm_loadu_si128((const __m128i *)(src + src_stride + j + 1)); + v_filtered1_w = xfilter_fn(v_src2_w, v_src3_w, v_filterx_w); + // Load the dst and msk for the variance calculation + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j)); + // Complete the calculation for this row and add it to the running total + v_res_w = yfilter_fn(v_filtered0_w, v_filtered1_w, v_filtery_w); + highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); + + // Load the next row & apply the filter + v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j)); + v_src1_w = + _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j + 1)); + v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w); + // Load the dst and msk for the variance calculation + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + dst_stride + j)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + msk_stride + j)); + // Complete the calculation for this row and add it to the running total + v_res_w = yfilter_fn(v_filtered1_w, v_filtered0_w, v_filtery_w); + highbd_sum_and_sse(v_res_w, v_dst_w, 
v_msk_b, &v_sum_d, &v_sse_q); + // Move onto the next block of rows + src += src_stride * 2; + dst += dst_stride * 2; + msk += msk_stride * 2; + } + // Reset to the top of the block + src -= src_stride * h; + dst -= dst_stride * h; + msk -= msk_stride * h; + } + return calc_var(v_sum_d, v_sse_q, sse, w, h); +} + +// Note order in which rows loaded xmm[127:64] = row 1, xmm[63:0] = row 2 +unsigned int aom_highbd_masked_subpel_var4xH_xzero( + const uint16_t *src, int src_stride, int yoffset, const uint16_t *dst, + int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse, + int h, highbd_calc_masked_var_t calc_var) { + int i; + __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_res_w; + __m128i v_dst_w, v_msk_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + __m128i v_filter_w = _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + + bilinear_filters_2t[yoffset][0]); + assert(yoffset < BIL_SUBPEL_SHIFTS); + // Load the first row of src data ready + v_src0_w = _mm_loadl_epi64((const __m128i *)src); + for (i = 0; i < h; i += 2) { + if (yoffset == HALF_PIXEL_OFFSET) { + // Load the rest of the source data for these rows + v_src1_w = _mm_or_si128( + _mm_slli_si128(v_src0_w, 8), + _mm_loadl_epi64((const __m128i *)(src + src_stride * 1))); + v_src0_w = _mm_or_si128( + _mm_slli_si128(v_src1_w, 8), + _mm_loadl_epi64((const __m128i *)(src + src_stride * 2))); + // Apply the y filter + v_res_w = _mm_avg_epu16(v_src1_w, v_src0_w); + } else { + // Load the data and apply the y filter + v_src1_w = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); + highbd_apply_filter_lo(v_src0_w, v_src1_w, v_filter_w, &v_filtered0_d); + v_src0_w = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); + highbd_apply_filter_lo(v_src1_w, v_src0_w, v_filter_w, &v_filtered1_d); + v_res_w = _mm_packs_epi32(v_filtered1_d, v_filtered0_d); + } + // Load the dst data + v_dst_w = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i 
*)(dst + dst_stride * 1)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0))); + // Load the mask data + v_msk_b = _mm_unpacklo_epi32( + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0))); + // Compute the sum and SSE + highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); + // Move onto the next set of rows + src += src_stride * 2; + dst += dst_stride * 2; + msk += msk_stride * 2; + } + return calc_var(v_sum_d, v_sse_q, sse, 4, h); +} + +unsigned int aom_highbd_masked_subpel_var4xH_yzero( + const uint16_t *src, int src_stride, int xoffset, const uint16_t *dst, + int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse, + int h, highbd_calc_masked_var_t calc_var) { + int i; + __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d; + __m128i v_src0_shift_w, v_src1_shift_w, v_res_w, v_dst_w, v_msk_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + __m128i v_filter_w = _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + + bilinear_filters_2t[xoffset][0]); + assert(xoffset < BIL_SUBPEL_SHIFTS); + for (i = 0; i < h; i += 2) { + // Load the src data + v_src0_w = _mm_loadu_si128((const __m128i *)(src)); + v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride)); + v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w); + v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w); + v_res_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w); + } else { + highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filter_w, + &v_filtered0_d); + highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filter_w, + &v_filtered1_d); + v_res_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d); + } + // Load the dst data + v_dst_w = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(dst + 
dst_stride * 0)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); + // Load the mask data + v_msk_b = _mm_unpacklo_epi32( + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); + // Compute the sum and SSE + highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); + // Move onto the next set of rows + src += src_stride * 2; + dst += dst_stride * 2; + msk += msk_stride * 2; + } + return calc_var(v_sum_d, v_sse_q, sse, 4, h); +} + +unsigned int aom_highbd_masked_subpel_var4xH_xnonzero_ynonzero( + const uint16_t *src, int src_stride, int xoffset, int yoffset, + const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int h, highbd_calc_masked_var_t calc_var) { + int i; + __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_dst_w, v_msk_b; + __m128i v_src0_shift_w, v_src1_shift_w; + __m128i v_xres0_w, v_xres1_w, v_res_w, v_temp_w; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + __m128i v_filterx_w = _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + + bilinear_filters_2t[xoffset][0]); + __m128i v_filtery_w = _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + + bilinear_filters_2t[yoffset][0]); + assert(xoffset < BIL_SUBPEL_SHIFTS); + assert(yoffset < BIL_SUBPEL_SHIFTS); + // Load the first block of src data + v_src0_w = _mm_loadu_si128((const __m128i *)(src)); + v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride)); + v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w); + v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w); + v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w); + } else { + highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w, + &v_filtered0_d); + highbd_apply_filter_lo(v_src1_w, 
v_src1_shift_w, v_filterx_w, + &v_filtered1_d); + v_xres0_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d); + } + for (i = 0; i < h; i += 4) { + // Load the next block of src data + v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 2)); + v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 3)); + v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w); + v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w); + v_xres1_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w); + } else { + highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w, + &v_filtered0_d); + highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w, + &v_filtered1_d); + v_xres1_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d); + } + // Apply the y filter to the previous block + v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres0_w, 8), + _mm_slli_si128(v_xres1_w, 8)); + if (yoffset == HALF_PIXEL_OFFSET) { + v_res_w = _mm_avg_epu16(v_xres0_w, v_temp_w); + } else { + v_res_w = highbd_apply_filter(v_xres0_w, v_temp_w, v_filtery_w); + } + // Load the dst data + v_dst_w = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); + // Load the mask data + v_msk_b = _mm_unpacklo_epi32( + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); + // Compute the sum and SSE + highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); + + // Load the next block of src data + v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 4)); + v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 5)); + v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + 
v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w); + v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w); + v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w); + } else { + highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w, + &v_filtered0_d); + highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w, + &v_filtered1_d); + v_xres0_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d); + } + // Apply the y filter to the previous block + v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres1_w, 8), + _mm_slli_si128(v_xres0_w, 8)); + if (yoffset == HALF_PIXEL_OFFSET) { + v_res_w = _mm_avg_epu16(v_xres1_w, v_temp_w); + } else { + v_res_w = highbd_apply_filter(v_xres1_w, v_temp_w, v_filtery_w); + } + // Load the dst data + v_dst_w = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3))); + // Load the mask data + v_msk_b = _mm_unpacklo_epi32( + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3))); + // Compute the sum and SSE + highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); + // Move onto the next set of rows + src += src_stride * 4; + dst += dst_stride * 4; + msk += msk_stride * 4; + } + return calc_var(v_sum_d, v_sse_q, sse, 4, h); +} + +// For W >=8 +#define HIGHBD_MASK_SUBPIX_VAR_LARGE(W, H) \ + unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse, highbd_calc_masked_var_t calc_var, \ + highbd_variance_fn_t full_variance_function) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + assert(W % 8 == 0); \ + if (xoffset == 0) { \ + if (yoffset == 0) \ + return full_variance_function(src8, src_stride, dst8, dst_stride, msk, \ + msk_stride, sse); \ + else if 
(yoffset == HALF_PIXEL_OFFSET) \ + return aom_highbd_masked_subpel_varWxH_xzero( \ + src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter_avg, calc_var); \ + else \ + return aom_highbd_masked_subpel_varWxH_xzero( \ + src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, \ + W, H, highbd_apply_filter, calc_var); \ + } else if (yoffset == 0) { \ + if (xoffset == HALF_PIXEL_OFFSET) \ + return aom_highbd_masked_subpel_varWxH_yzero( \ + src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter_avg, calc_var); \ + else \ + return aom_highbd_masked_subpel_varWxH_yzero( \ + src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, \ + W, H, highbd_apply_filter, calc_var); \ + } else if (xoffset == HALF_PIXEL_OFFSET) { \ + if (yoffset == HALF_PIXEL_OFFSET) \ + return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, \ + dst_stride, msk, msk_stride, sse, W, H, highbd_apply_filter_avg, \ + highbd_apply_filter_avg, calc_var); \ + else \ + return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter_avg, \ + highbd_apply_filter, calc_var); \ + } else { \ + if (yoffset == HALF_PIXEL_OFFSET) \ + return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter, \ + highbd_apply_filter_avg, calc_var); \ + else \ + return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, xoffset, yoffset, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter, highbd_apply_filter, \ + calc_var); \ + } \ + } + +// For W < 8 +#define HIGHBD_MASK_SUBPIX_VAR_SMALL(W, H) \ + unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int 
src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse, highbd_calc_masked_var_t calc_var, \ + highbd_variance_fn_t full_variance_function) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + assert(W == 4); \ + if (xoffset == 0 && yoffset == 0) \ + return full_variance_function(src8, src_stride, dst8, dst_stride, msk, \ + msk_stride, sse); \ + else if (xoffset == 0) \ + return aom_highbd_masked_subpel_var4xH_xzero( \ + src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, H, \ + calc_var); \ + else if (yoffset == 0) \ + return aom_highbd_masked_subpel_var4xH_yzero( \ + src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, H, \ + calc_var); \ + else \ + return aom_highbd_masked_subpel_var4xH_xnonzero_ynonzero( \ + src, src_stride, xoffset, yoffset, dst, dst_stride, msk, msk_stride, \ + sse, H, calc_var); \ + } + +#define HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(W, H) \ + unsigned int aom_highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \ + sse, calc_masked_variance, \ + aom_highbd_masked_variance##W##x##H##_ssse3); \ + } \ + unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \ + sse, highbd_10_calc_masked_variance, \ + aom_highbd_10_masked_variance##W##x##H##_ssse3); \ + } \ + unsigned int 
aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \ + sse, highbd_12_calc_masked_variance, \ + aom_highbd_12_masked_variance##W##x##H##_ssse3); \ + } + +HIGHBD_MASK_SUBPIX_VAR_SMALL(4, 4) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(4, 4) +HIGHBD_MASK_SUBPIX_VAR_SMALL(4, 8) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(4, 8) +HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 4) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 4) +HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 8) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 8) +HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 16) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 16) +HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 8) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 8) +HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 16) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 16) +HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 32) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 32) +HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 16) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 16) +HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 32) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 32) +HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 64) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 64) +HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 32) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 32) +HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 64) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 64) +#if CONFIG_EXT_PARTITION +HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 128) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 128) +HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 64) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 64) +HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 128) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 128) +#endif // CONFIG_EXT_PARTITION +#endif diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c new file mode 100644 index 000000000..ad77f974c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c 
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./aom_config.h"
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+// OBMC SAD for a 4-wide block of 8-bit pixels.
+//
+// For each of the 4 * height samples, accumulates
+//   xx_roundn_epu32(|wsrc[i] - pre[i] * mask[i]|, 12)
+// and returns the total. wsrc and mask are read as one contiguous run of
+// int32 values (4 per row); pre is read 4 bytes per row, with consecutive rows
+// pre_stride bytes apart.
+static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride,
+                                       const int32_t *wsrc, const int32_t *mask,
+                                       const int height) {
+  const int pre_step = pre_stride - 4;
+  int n = 0;
+  __m128i v_sad_d = _mm_setzero_si128();
+
+  do {
+    const __m128i v_p_b = xx_loadl_32(pre + n);
+    const __m128i v_m_d = xx_load_128(mask + n);
+    const __m128i v_w_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+    const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
+
+    // Rounded absolute difference
+    const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
+
+    v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
+
+    n += 4;
+
+    // n grows by exactly one 4-sample row per iteration, so every iteration
+    // ends a row: skip the rest of the stride unconditionally.
+    pre += pre_step;
+  } while (n < 4 * height);
+
+  return xx_hsum_epi32_si32(v_sad_d);
+}
+
+// OBMC SAD for 8-bit blocks whose width is a power of two >= 8 (asserted).
+//
+// Same per-sample computation as obmc_sad_w4, handling 8 samples per
+// iteration. wsrc and mask are indexed contiguously by n; pre jumps to the
+// next row each time n reaches a multiple of width.
+static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre,
+                                        const int pre_stride,
+                                        const int32_t *wsrc,
+                                        const int32_t *mask, const int width,
+                                        const int height) {
+  const int pre_step = pre_stride - width;
+  int n = 0;
+  __m128i v_sad_d = _mm_setzero_si128();
+
+  assert(width >= 8);
+  assert(IS_POWER_OF_TWO(width));
+
+  do {
+    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
+    const __m128i v_m1_d = xx_load_128(mask + n + 4);
+    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+    const __m128i v_p0_b = xx_loadl_32(pre + n);
+    const __m128i v_m0_d = xx_load_128(mask + n);
+    const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
+    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
+    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
+
+    // Rounded absolute difference
+    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
+    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
+
+    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
+    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
+
+    n += 8;
+
+    // End of a row: skip the part of the stride that lies outside the block.
+    if (n % width == 0) pre += pre_step;
+  } while (n < width * height);
+
+  return xx_hsum_epi32_si32(v_sad_d);
+}
+
+// Defines aom_obmc_sad<w>x<h>_sse4_1(), which forwards to obmc_sad_w4 when
+// w == 4 and to obmc_sad_w8n for every other width.
+#define OBMCSADWXH(w, h)                                       \
+  unsigned int aom_obmc_sad##w##x##h##_sse4_1(                 \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+      const int32_t *msk) {                                    \
+    if (w == 4) {                                              \
+      return obmc_sad_w4(pre, pre_stride, wsrc, msk, h);       \
+    } else {                                                   \
+      return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h);   \
+    }                                                          \
+  }
+
+#if CONFIG_EXT_PARTITION
+OBMCSADWXH(128, 128)
+OBMCSADWXH(128, 64)
+OBMCSADWXH(64, 128)
+#endif  // CONFIG_EXT_PARTITION
+OBMCSADWXH(64, 64)
+OBMCSADWXH(64, 32)
+OBMCSADWXH(32, 64)
+OBMCSADWXH(32, 32)
+OBMCSADWXH(32, 16)
+OBMCSADWXH(16, 32)
+OBMCSADWXH(16, 16)
+OBMCSADWXH(16, 8)
+OBMCSADWXH(8, 16)
+OBMCSADWXH(8, 8)
+OBMCSADWXH(8, 4)
+OBMCSADWXH(4, 8)
+OBMCSADWXH(4, 4)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+#if CONFIG_HIGHBITDEPTH
+// High bit-depth counterpart of obmc_sad_w4.
+//
+// pre8 is converted with CONVERT_TO_SHORTPTR and then read as 16-bit samples
+// (4 per row); pre_stride is applied in units of those samples.
+static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
+                                           const int pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask,
+                                           const int height) {
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+  const int pre_step = pre_stride - 4;
+  int n = 0;
+  __m128i v_sad_d = _mm_setzero_si128();
+
+  do {
+    const __m128i v_p_w = xx_loadl_64(pre + n);
+    const __m128i v_m_d = xx_load_128(mask + n);
+    const __m128i v_w_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+    const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
+
+    // Rounded absolute difference
+    const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
+
+    v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
+
+    n += 4;
+
+    // Each iteration consumes one full 4-sample row, so always move on to the
+    // next row here.
+    pre += pre_step;
+  } while (n < 4 * height);
+
+  return xx_hsum_epi32_si32(v_sad_d);
+}
+
+// High bit-depth counterpart of obmc_sad_w8n: width must be a power of two
+// >= 8 (asserted); pre8 is converted with CONVERT_TO_SHORTPTR and read as
+// 16-bit samples, 8 per iteration.
+static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
+                                            const int pre_stride,
+                                            const int32_t *wsrc,
+                                            const int32_t *mask,
+                                            const int width, const int height) {
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+  const int pre_step = pre_stride - width;
+  int n = 0;
+  __m128i v_sad_d = _mm_setzero_si128();
+
+  assert(width >= 8);
+  assert(IS_POWER_OF_TWO(width));
+
+  do {
+    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
+    const __m128i v_m1_d = xx_load_128(mask + n + 4);
+    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+    const __m128i v_p0_w = xx_loadl_64(pre + n);
+    const __m128i v_m0_d = xx_load_128(mask + n);
+    const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
+    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
+    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
+
+    // Rounded absolute difference
+    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
+    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
+
+    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
+    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
+
+    n += 8;
+
+    // End of a row: skip the part of the stride that lies outside the block.
+    if (n % width == 0) pre += pre_step;
+  } while (n < width * height);
+
+  return xx_hsum_epi32_si32(v_sad_d);
+}
+
+// Defines aom_highbd_obmc_sad<w>x<h>_sse4_1(), which forwards to
+// hbd_obmc_sad_w4 when w == 4 and to hbd_obmc_sad_w8n for every other width.
+#define HBD_OBMCSADWXH(w, h)                                      \
+  unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1(             \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
+      const int32_t *mask) {                                      \
+    if (w == 4) {                                                 \
+      return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h);     \
+    } else {                                                      \
+      return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \
+    }                                                             \
+  }
+
+#if CONFIG_EXT_PARTITION
+HBD_OBMCSADWXH(128, 128)
+HBD_OBMCSADWXH(128, 64)
+HBD_OBMCSADWXH(64, 128)
+#endif  // CONFIG_EXT_PARTITION
+HBD_OBMCSADWXH(64, 64)
+HBD_OBMCSADWXH(64, 32)
+HBD_OBMCSADWXH(32, 64)
+HBD_OBMCSADWXH(32, 32)
+HBD_OBMCSADWXH(32, 16)
+HBD_OBMCSADWXH(16, 32)
+HBD_OBMCSADWXH(16, 16)
+HBD_OBMCSADWXH(16, 8)
+HBD_OBMCSADWXH(8, 16)
+HBD_OBMCSADWXH(8, 8)
+HBD_OBMCSADWXH(8, 4)
+HBD_OBMCSADWXH(4, 8)
+HBD_OBMCSADWXH(4, 4)
+#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
new file mode 100644
index 000000000..efb3659cf
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./aom_config.h"
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/aom_filter.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+// OBMC sum / sum-of-squares for a 4-wide block of 8-bit pixels.
+//
+// For each of the 4 * h samples, d = xx_roundn_epi32(wsrc[i] - pre[i] *
+// mask[i], 12). *sum is set to the total of d and *sse to the total of d * d;
+// both outputs are overwritten. h must be a power of two (asserted). wsrc and
+// mask are read contiguously (4 int32 per row); pre rows are pre_stride bytes
+// apart.
+static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
+                                    const int32_t *wsrc, const int32_t *mask,
+                                    unsigned int *const sse, int *const sum,
+                                    const int h) {
+  const int pre_step = pre_stride - 4;
+  int n = 0;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_d = _mm_setzero_si128();
+
+  assert(IS_POWER_OF_TWO(h));
+
+  do {
+    const __m128i v_p_b = xx_loadl_32(pre + n);
+    const __m128i v_m_d = xx_load_128(mask + n);
+    const __m128i v_w_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
+    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
+
+    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+    n += 4;
+
+    // n grows by exactly one 4-sample row per iteration, so every iteration
+    // ends a row: skip the rest of the stride unconditionally.
+    pre += pre_step;
+  } while (n < 4 * h);
+
+  *sum = xx_hsum_epi32_si32(v_sum_d);
+  *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+// OBMC sum / sum-of-squares for 8-bit blocks whose width is a power of two
+// >= 8; w and h are both asserted to be powers of two.
+//
+// Same per-sample difference as obmc_variance_w4, 8 samples per iteration.
+// The two groups of rounded differences are packed to 16 bits and squared and
+// pair-summed with pmaddwd. *sum and *sse are overwritten.
+static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
+                                     const int32_t *wsrc, const int32_t *mask,
+                                     unsigned int *const sse, int *const sum,
+                                     const int w, const int h) {
+  const int pre_step = pre_stride - w;
+  int n = 0;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_d = _mm_setzero_si128();
+
+  assert(w >= 8);
+  assert(IS_POWER_OF_TWO(w));
+  assert(IS_POWER_OF_TWO(h));
+
+  do {
+    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
+    const __m128i v_m1_d = xx_load_128(mask + n + 4);
+    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+    const __m128i v_p0_b = xx_loadl_32(pre + n);
+    const __m128i v_m0_d = xx_load_128(mask + n);
+    const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
+    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+
+    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
+    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
+    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
+    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
+    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+    n += 8;
+
+    // End of a row: skip the part of the stride that lies outside the block.
+    if (n % w == 0) pre += pre_step;
+  } while (n < w * h);
+
+  *sum = xx_hsum_epi32_si32(v_sum_d);
+  *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+// Defines aom_obmc_variance<W>x<H>_sse4_1(): fills *sse through
+// obmc_variance_w4 (W == 4) or obmc_variance_w8n (otherwise) and returns
+// *sse - sum * sum / (W * H).
+#define OBMCVARWXH(W, H)                                               \
+  unsigned int aom_obmc_variance##W##x##H##_sse4_1(                    \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
+      const int32_t *mask, unsigned int *sse) {                        \
+    int sum;                                                           \
+    if (W == 4) {                                                      \
+      obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H);     \
+    } else {                                                           \
+      obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+    }                                                                  \
+    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));      \
+  }
+
+#if CONFIG_EXT_PARTITION
+OBMCVARWXH(128, 128)
+OBMCVARWXH(128, 64)
+OBMCVARWXH(64, 128)
+#endif  // CONFIG_EXT_PARTITION
+OBMCVARWXH(64, 64)
+OBMCVARWXH(64, 32)
+OBMCVARWXH(32, 64)
+OBMCVARWXH(32, 32)
+OBMCVARWXH(32, 16)
+OBMCVARWXH(16, 32)
+OBMCVARWXH(16, 16)
+OBMCVARWXH(16, 8)
+OBMCVARWXH(8, 16)
+OBMCVARWXH(8, 8)
+OBMCVARWXH(8, 4)
+OBMCVARWXH(4, 8)
+OBMCVARWXH(4, 4)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+#if CONFIG_HIGHBITDEPTH
+// High bit-depth counterpart of obmc_variance_w4: pre8 is converted with
+// CONVERT_TO_SHORTPTR and read as 16-bit samples (4 per row).
+//
+// The totals are ADDED to *sum and *sse, matching hbd_obmc_variance_w8n, so
+// callers must zero both accumulators before the first call (every caller in
+// this file does).
+static INLINE void hbd_obmc_variance_w4(
+    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) {
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+  const int pre_step = pre_stride - 4;
+  int n = 0;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_d = _mm_setzero_si128();
+
+  assert(IS_POWER_OF_TWO(h));
+
+  do {
+    const __m128i v_p_w = xx_loadl_64(pre + n);
+    const __m128i v_m_d = xx_load_128(mask + n);
+    const __m128i v_w_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
+    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
+
+    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+    n += 4;
+
+    // Each iteration consumes one full 4-sample row, so always move on to the
+    // next row here.
+    pre += pre_step;
+  } while (n < 4 * h);
+
+  // Accumulate through the 64-bit reducer, exactly as the w8n variant does,
+  // so both helpers share one contract for their 64-bit outputs.
+  *sum += xx_hsum_epi32_si64(v_sum_d);
+  *sse += xx_hsum_epi32_si64(v_sse_d);
+}
+
+// High bit-depth counterpart of obmc_variance_w8n (w a power of two >= 8 and
+// h a power of two, all asserted); pre8 is converted with CONVERT_TO_SHORTPTR.
+//
+// The totals are ADDED to *sum and *sse, which lets highbd_12_obmc_variance
+// call this repeatedly on successive strips of one block.
+static INLINE void hbd_obmc_variance_w8n(
+    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w,
+    const int h) {
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+  const int pre_step = pre_stride - w;
+  int n = 0;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_d = _mm_setzero_si128();
+
+  assert(w >= 8);
+  assert(IS_POWER_OF_TWO(w));
+  assert(IS_POWER_OF_TWO(h));
+
+  do {
+    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
+    const __m128i v_m1_d = xx_load_128(mask + n + 4);
+    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+    const __m128i v_p0_w = xx_loadl_64(pre + n);
+    const __m128i v_m0_d = xx_load_128(mask + n);
+    const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
+    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+
+    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
+    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
+    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
+    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
+    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+    n += 8;
+
+    // End of a row: skip the part of the stride that lies outside the block.
+    if (n % w == 0) pre += pre_step;
+  } while (n < w * h);
+
+  *sum += xx_hsum_epi32_si64(v_sum_d);
+  *sse += xx_hsum_epi32_si64(v_sse_d);
+}
+
+// Computes the 64-bit totals with the width-appropriate helper and narrows
+// them, unscaled, into *sum and *sse.
+static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
+                                        const int32_t *wsrc,
+                                        const int32_t *mask, int w, int h,
+                                        unsigned int *sse, int *sum) {
+  int64_t sum64 = 0;
+  uint64_t sse64 = 0;
+  if (w == 4) {
+    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
+  } else {
+    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+  }
+  *sum = (int)sum64;
+  *sse = (unsigned int)sse64;
+}
+
+// As highbd_obmc_variance, but the totals are reduced with
+// ROUND_POWER_OF_TWO: sum by 2 bits and sse by 4 bits.
+// NOTE(review): presumably this rescales 10-bit results to the 8-bit range --
+// confirm against the C reference implementation.
+static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask, int w, int h,
+                                           unsigned int *sse, int *sum) {
+  int64_t sum64 = 0;
+  uint64_t sse64 = 0;
+  if (w == 4) {
+    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
+  } else {
+    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+  }
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
+
+// As highbd_obmc_variance, but the totals are reduced with
+// ROUND_POWER_OF_TWO: sum by 4 bits and sse by 8 bits.
+//
+// 128-wide blocks are processed in strips of 32 rows, and 64-wide blocks with
+// h >= 128 in strips of 64 rows, with the 64-bit totals carried across strips.
+// NOTE(review): presumably the strips keep the 32-bit lane accumulators inside
+// hbd_obmc_variance_w8n from overflowing; the reason is not stated here.
+static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask, int w, int h,
+                                           unsigned int *sse, int *sum) {
+  int64_t sum64 = 0;
+  uint64_t sse64 = 0;
+  if (w == 128) {
+    do {
+      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 128,
+                            32);
+      pre8 += 32 * pre_stride;
+      wsrc += 32 * 128;
+      mask += 32 * 128;
+      h -= 32;
+    } while (h > 0);
+  } else if (w == 64 && h >= 128) {
+    do {
+      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 64,
+                            64);
+      pre8 += 64 * pre_stride;
+      wsrc += 64 * 64;
+      mask += 64 * 64;
+      h -= 64;
+    } while (h > 0);
+  } else if (w == 4) {
+    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
+  } else {
+    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+  }
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
+
+// Defines the three high bit-depth entry points for one block size:
+//   aom_highbd_obmc_variance<W>x<H>_sse4_1    -> highbd_obmc_variance
+//   aom_highbd_10_obmc_variance<W>x<H>_sse4_1 -> highbd_10_obmc_variance
+//   aom_highbd_12_obmc_variance<W>x<H>_sse4_1 -> highbd_12_obmc_variance
+// Each fills *sse and returns *sse - sum * sum / (W * H); the 10- and 12-bit
+// versions do the subtraction in 64 bits and return 0 for a negative result.
+#define HBD_OBMCVARWXH(W, H)                                               \
+  unsigned int aom_highbd_obmc_variance##W##x##H##_sse4_1(                 \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
+      const int32_t *mask, unsigned int *sse) {                            \
+    int sum;                                                               \
+    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
+    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
+  }                                                                        \
+                                                                           \
+  unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1(              \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
+      const int32_t *mask, unsigned int *sse) {                            \
+    int sum;                                                               \
+    int64_t var;                                                           \
+    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
+    return (var >= 0) ? (uint32_t)var : 0;                                 \
+  }                                                                        \
+                                                                           \
+  unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1(              \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
+      const int32_t *mask, unsigned int *sse) {                            \
+    int sum;                                                               \
+    int64_t var;                                                           \
+    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
+    return (var >= 0) ? (uint32_t)var : 0;                                 \
+  }
+
+#if CONFIG_EXT_PARTITION
+HBD_OBMCVARWXH(128, 128)
+HBD_OBMCVARWXH(128, 64)
+HBD_OBMCVARWXH(64, 128)
+#endif  // CONFIG_EXT_PARTITION
+HBD_OBMCVARWXH(64, 64)
+HBD_OBMCVARWXH(64, 32)
+HBD_OBMCVARWXH(32, 64)
+HBD_OBMCVARWXH(32, 32)
+HBD_OBMCVARWXH(32, 16)
+HBD_OBMCVARWXH(16, 32)
+HBD_OBMCVARWXH(16, 16)
+HBD_OBMCVARWXH(16, 8)
+HBD_OBMCVARWXH(8, 16)
+HBD_OBMCVARWXH(8, 8)
+HBD_OBMCVARWXH(8, 4)
+HBD_OBMCVARWXH(4, 8)
+HBD_OBMCVARWXH(4, 4)
+#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
new file mode 100644
index 000000000..954a95b98
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
@@ -0,0 +1,547 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro QUANTIZE_FN 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, \ + eob, scan, iscan + + vzeroupper + + ; If we can skip this block, then just zero the output + cmp skipmp, 0 + jne .blank + +%ifnidn %1, b_32x32 + + ; Special case for ncoeff == 16, as it is frequent and we can save on + ; not setting up a loop. + cmp ncoeffmp, 16 + jne .generic + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Special case of ncoeff == 16 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +.single: + + movifnidn coeffq, coeffmp + movifnidn zbinq, zbinmp + mova m0, [zbinq] ; m0 = zbin + + ; Get DC and first 15 AC coeffs - in this special case, that is all. +%if CONFIG_HIGHBITDEPTH + ; coeff stored as 32bit numbers but we process them as 16 bit numbers + mova m9, [coeffq] + packssdw m9, [coeffq+16] ; m9 = c[i] + mova m10, [coeffq+32] + packssdw m10, [coeffq+48] ; m10 = c[i] +%else + mova m9, [coeffq] ; m9 = c[i] + mova m10, [coeffq+16] ; m10 = c[i] +%endif + + mov r0, eobmp ; Output pointer + mov r1, qcoeffmp ; Output pointer + mov r2, dqcoeffmp ; Output pointer + + pxor m5, m5 ; m5 = dedicated zero + + pcmpeqw m4, m4 ; All word lanes -1 + paddw m0, m4 ; m0 = zbin - 1 + + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + punpckhqdq m0, m0 + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin + + ; Check if all coeffs are less than zbin. If yes, we just write zeros + ; to the outputs and we are done. 
+ por m14, m7, m12 + ptest m14, m14 + jnz .single_nonzero + +%if CONFIG_HIGHBITDEPTH + mova [r1 ], ymm5 + mova [r1+32], ymm5 + mova [r2 ], ymm5 + mova [r2+32], ymm5 +%else + mova [r1], ymm5 + mova [r2], ymm5 +%endif + mov [r0], word 0 + + vzeroupper + RET + +.single_nonzero: + + ; Actual quantization of size 16 block - setup pointers, rounders, etc. + movifnidn r4, roundmp + movifnidn r5, quantmp + mov r3, dequantmp + mov r6, shiftmp + mova m1, [r4] ; m1 = round + mova m2, [r5] ; m2 = quant + mova m3, [r3] ; m3 = dequant + mova m4, [r6] ; m4 = shift + + mov r3, iscanmp + + DEFINE_ARGS eob, qcoeff, dqcoeff, iscan + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m8, m6 ; m8 += m6 + paddw m13, m11 ; m13 += m11 + pmulhw m8, m4 ; m8 = m8*qsh>>16 + punpckhqdq m4, m4 + pmulhw m13, m4 ; m13 = m13*qsh>>16 + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m8, m7 + pand m13, m12 + +%if CONFIG_HIGHBITDEPTH + ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pcmpgtw m6, m5, m8 + punpckhwd m6, m8, m6 + pmovsxwd m11, m8 + mova [qcoeffq ], m11 + mova [qcoeffq+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 + mova [qcoeffq+32], m11 + mova [qcoeffq+48], m6 +%else + mova [qcoeffq ], m8 + mova [qcoeffq+16], m13 +%endif + + pmullw m8, m3 ; dqc[i] = qc[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; dqc[i] = qc[i] * q + +%if CONFIG_HIGHBITDEPTH + ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pcmpgtw m6, m5, m8 + punpckhwd m6, m8, m6 + pmovsxwd m11, m8 + mova [dqcoeffq ], m11 + mova [dqcoeffq+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 + mova [dqcoeffq+32], m11 + mova [dqcoeffq+48], m6 +%else + mova [dqcoeffq ], m8 + mova [dqcoeffq+16], m13 
+%endif + + mova m6, [iscanq] ; m6 = scan[i] + mova m11, [iscanq+16] ; m11 = scan[i] + + pcmpeqw m8, m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m13, m5 ; m13 = c[i] == 0 + psubw m6, m6, m7 ; m6 = scan[i] + 1 + psubw m11, m11, m12 ; m11 = scan[i] + 1 + pandn m8, m8, m6 ; m8 = max(eob) + pandn m13, m13, m11 ; m13 = max(eob) + pmaxsw m8, m8, m13 + + ; Horizontally accumulate/max eobs and write into [eob] memory pointer + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + movq rax, m8 + mov [eobq], ax + + vzeroupper + RET + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Generic case of ncoeff != 16 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +.generic: + +%endif ; %ifnidn %1, b_32x32 + +DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ + qcoeff, dqcoeff, dequant, eob, scan, iscan + + ; Actual quantization loop - setup pointers, rounders, etc. + movifnidn coeffq, coeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, dequantmp + movifnidn zbinq, zbinmp + movifnidn roundq, roundmp + movifnidn quantq, quantmp + mova m0, [zbinq] ; m0 = zbin + mova m1, [roundq] ; m1 = round + mova m2, [quantq] ; m2 = quant + mova m3, [r2] ; m3 = dequant + pcmpeqw m4, m4 ; All lanes -1 +%ifidn %1, b_32x32 + psubw m0, m4 + psubw m1, m4 + psrlw m0, 1 ; m0 = (m0 + 1) / 2 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif + paddw m0, m4 ; m0 = m0 - 1 (m4 is all -1) + + mov r2, shiftmp + mov r3, qcoeffmp + mova m4, [r2] ; m4 = shift + mov r4, dqcoeffmp + mov r5, iscanmp +%ifidn %1, b_32x32 + psllw m4, 1 +%endif + pxor m5, m5 ; m5 = dedicated zero + + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob + +%if CONFIG_HIGHBITDEPTH + lea coeffq, [ coeffq+ncoeffq*4] + lea qcoeffq, [ qcoeffq+ncoeffq*4] + lea dqcoeffq, [dqcoeffq+ncoeffq*4] +%else + lea coeffq, [ coeffq+ncoeffq*2] + lea qcoeffq, [ qcoeffq+ncoeffq*2] + lea dqcoeffq, [dqcoeffq+ncoeffq*2] +%endif + lea
iscanq, [ iscanq+ncoeffq*2] + neg ncoeffq + + ; get DC and first 15 AC coeffs +%if CONFIG_HIGHBITDEPTH + ; coeff stored as 32bit numbers & require 16bit numbers + mova m9, [coeffq+ncoeffq*4+ 0] + packssdw m9, [coeffq+ncoeffq*4+16] + mova m10, [coeffq+ncoeffq*4+32] + packssdw m10, [coeffq+ncoeffq*4+48] +%else + mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] +%endif + + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + punpckhqdq m0, m0 + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin + + ; Check if all coeffs are less than zbin. If yes, skip forward quickly. + por m14, m7, m12 + ptest m14, m14 + jnz .first_nonzero + +%if CONFIG_HIGHBITDEPTH + mova [qcoeffq+ncoeffq*4 ], ymm5 + mova [qcoeffq+ncoeffq*4+32], ymm5 + mova [dqcoeffq+ncoeffq*4 ], ymm5 + mova [dqcoeffq+ncoeffq*4+32], ymm5 +%else + mova [qcoeffq+ncoeffq*2], ymm5 + mova [dqcoeffq+ncoeffq*2], ymm5 +%endif + + add ncoeffq, mmsize + + punpckhqdq m1, m1 + punpckhqdq m2, m2 + punpckhqdq m3, m3 + punpckhqdq m4, m4 + pxor m8, m8 + + jmp .ac_only_loop + +.first_nonzero: + + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m8, m6 ; m8 += m6 + paddw m13, m11 ; m13 += m11 + pmulhw m8, m4 ; m8 = m8*qsh>>16 + punpckhqdq m4, m4 + pmulhw m13, m4 ; m13 = m13*qsh>>16 + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m8, m7 + pand m13, m12 + +%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pcmpgtw m6, m5, m8 + punpckhwd m6, m8, m6 + pmovsxwd m11, m8 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 +%else + mova [qcoeffq+ncoeffq*2+ 0], m8 + mova [qcoeffq+ncoeffq*2+16], 
m13 +%endif + +%ifidn %1, b_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif + pmullw m8, m3 ; dqc[i] = qc[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 +%endif + +%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff + pcmpgtw m6, m5, m8 + punpckhwd m6, m8, m6 + pmovsxwd m11, m8 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 + mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 +%else + mova [dqcoeffq+ncoeffq*2+ 0], m8 + mova [dqcoeffq+ncoeffq*2+16], m13 +%endif + + pcmpeqw m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [iscanq+ncoeffq*2] ; m6 = scan[i] + mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m8, m6 ; m8 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m13 + add ncoeffq, mmsize + +.ac_only_loop: + +%if CONFIG_HIGHBITDEPTH + ; pack coeff from 32bit to 16bit array + mova m9, [coeffq+ncoeffq*4+ 0] + packssdw m9, [coeffq+ncoeffq*4+16] + mova m10, [coeffq+ncoeffq*4+32] + packssdw m10, [coeffq+ncoeffq*4+48] +%else + mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] +%endif + + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin + + ; Check if all coeffs are less than zbin. If yes, skip this iteration. + ; And just write zeros as the result would be.
+ por m14, m7, m12 + ptest m14, m14 + jnz .rest_nonzero + +%if CONFIG_HIGHBITDEPTH + mova [qcoeffq+ncoeffq*4+ 0], ymm5 + mova [qcoeffq+ncoeffq*4+32], ymm5 + mova [dqcoeffq+ncoeffq*4+ 0], ymm5 + mova [dqcoeffq+ncoeffq*4+32], ymm5 +%else + mova [qcoeffq+ncoeffq*2+ 0], ymm5 + mova [dqcoeffq+ncoeffq*2+ 0], ymm5 +%endif + add ncoeffq, mmsize + jnz .ac_only_loop + + ; Horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + movq rax, m8 + mov [r2], ax + vzeroupper + RET + +.rest_nonzero: + paddsw m6, m1 ; m6 += round + paddsw m11, m1 ; m11 += round + pmulhw m14, m6, m2 ; m14 = m6*q>>16 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m14, m6 ; m14 += m6 + paddw m13, m11 ; m13 += m11 + pmulhw m14, m4 ; m14 = m14*qsh>>16 + pmulhw m13, m4 ; m13 = m13*qsh>>16 + psignw m14, m9 ; m14 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m14, m7 + pand m13, m12 + +%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pcmpgtw m6, m5, m14 + punpckhwd m6, m14, m6 + pmovsxwd m11, m14 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 +%else + mova [qcoeffq+ncoeffq*2+ 0], m14 + mova [qcoeffq+ncoeffq*2+16], m13 +%endif + +%ifidn %1, b_32x32 + pabsw m14, m14 + pabsw m13, m13 +%endif + pmullw m14, m3 ; dqc[i] = qc[i] * q + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m14, 1 + psrlw m13, 1 + psignw m14, m9 + psignw m13, m10 +%endif + +%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pcmpgtw m6, m5, m14 + punpckhwd m6, m14, m6 + pmovsxwd m11, m14 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 
+ mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 +%else + mova [dqcoeffq+ncoeffq*2+ 0], m14 + mova [dqcoeffq+ncoeffq*2+16], m13 +%endif + + pcmpeqw m14, m5 ; m14 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m14, m6 ; m14 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m14 + pmaxsw m8, m13 + add ncoeffq, mmsize + jnz .ac_only_loop + + ; Horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + movq rax, m8 + mov [r2], ax + vzeroupper + RET + + ; Skip-block, i.e. just write all zeroes +.blank: + +DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ + qcoeff, dqcoeff, dequant, eob, scan, iscan + + mov r0, dqcoeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, qcoeffmp + mov r3, eobmp + +DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob + +%if CONFIG_HIGHBITDEPTH + lea dqcoeffq, [dqcoeffq+ncoeffq*4] + lea qcoeffq, [ qcoeffq+ncoeffq*4] +%else + lea dqcoeffq, [dqcoeffq+ncoeffq*2] + lea qcoeffq, [ qcoeffq+ncoeffq*2] +%endif + + neg ncoeffq + pxor m7, m7 + +.blank_loop: +%if CONFIG_HIGHBITDEPTH + mova [dqcoeffq+ncoeffq*4+ 0], ymm7 + mova [dqcoeffq+ncoeffq*4+32], ymm7 + mova [qcoeffq+ncoeffq*4+ 0], ymm7 + mova [qcoeffq+ncoeffq*4+32], ymm7 +%else + mova [dqcoeffq+ncoeffq*2+ 0], ymm7 + mova [qcoeffq+ncoeffq*2+ 0], ymm7 +%endif + add ncoeffq, mmsize + jl .blank_loop + + mov [eobq], word 0 + + vzeroupper + RET +%endmacro + +INIT_XMM avx +QUANTIZE_FN b, 7 +QUANTIZE_FN b_32x32, 7 + +END diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c new file mode 100644 index 000000000..890c1f01e --- /dev/null +++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 
2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" + +static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { +#if CONFIG_HIGHBITDEPTH + return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], + (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], + (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], + (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); +#else + return _mm_load_si128((const __m128i *)coeff_ptr); +#endif +} + +static INLINE void store_coefficients(__m128i coeff_vals, + tran_low_t *coeff_ptr) { +#if CONFIG_HIGHBITDEPTH + __m128i one = _mm_set1_epi16(1); + __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); + __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); + __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); + __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); + _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); +#else + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); +#endif +} + +void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + __m128i zero; + (void)scan_ptr; + + 
coeff_ptr += n_coeffs; + iscan_ptr += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + zero = _mm_setzero_si128(); + if (!skip_block) { + __m128i eob; + __m128i zbin; + __m128i round, quant, dequant, shift; + { + __m128i coeff0, coeff1; + + // Setup global values + { + __m128i pw_1; + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + pw_1 = _mm_set1_epi16(1); + zbin = _mm_sub_epi16(zbin, pw_1); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + } + + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + __m128i cmp_mask0, cmp_mask1; + // Do DC and first 15 AC + coeff0 = load_coefficients(coeff_ptr + n_coeffs); + coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + round = _mm_unpackhi_epi64(round, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); + quant = _mm_unpackhi_epi64(quant, quant); + qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); + qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); + qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); + shift = _mm_unpackhi_epi64(shift, shift); + qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); + 
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); + store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); + + coeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); + store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); + } + + { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob = _mm_max_epi16(eob, eob1); + } + n_coeffs += 8 * 2; + } + + // AC only loop + while (n_coeffs < 0) { + __m128i coeff0, coeff1; + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + __m128i cmp_mask0, cmp_mask1; + + coeff0 = load_coefficients(coeff_ptr + n_coeffs); + coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 
+ + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); + qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); + qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); + qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); + qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); + store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); + + coeff0 = _mm_mullo_epi16(qcoeff0, dequant); + coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); + store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); + } + + { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob0, eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob0 = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob0 = _mm_max_epi16(eob0, eob1); + eob = _mm_max_epi16(eob, eob0); + } + n_coeffs += 8 * 2; + } + + // Accumulate EOB + { + __m128i eob_shuffled; + eob_shuffled = 
_mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + *eob_ptr = _mm_extract_epi16(eob, 1); + } + } else { + do { + store_coefficients(zero, dqcoeff_ptr + n_coeffs); + store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8); + store_coefficients(zero, qcoeff_ptr + n_coeffs); + store_coefficients(zero, qcoeff_ptr + n_coeffs + 8); + n_coeffs += 8 * 2; + } while (n_coeffs < 0); + *eob_ptr = 0; + } +} diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm new file mode 100644 index 000000000..36b4dddbd --- /dev/null +++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm @@ -0,0 +1,349 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_1: times 8 dw 1 + +SECTION .text + +; TODO(yunqingwang)fix quantize_b code for skip=1 case. +%macro QUANTIZE_FN 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, \ + eob, scan, iscan + cmp dword skipm, 0 + jne .blank + + ; actual quantize loop - setup pointers, rounders, etc. 
+ movifnidn coeffq, coeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, dequantmp + movifnidn zbinq, zbinmp + movifnidn roundq, roundmp + movifnidn quantq, quantmp + mova m0, [zbinq] ; m0 = zbin + mova m1, [roundq] ; m1 = round + mova m2, [quantq] ; m2 = quant +%ifidn %1, b_32x32 + pcmpeqw m5, m5 + psrlw m5, 15 + paddw m0, m5 + paddw m1, m5 + psrlw m0, 1 ; m0 = (m0 + 1) / 2 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif + mova m3, [r2q] ; m3 = dequant + psubw m0, [pw_1] + mov r2, shiftmp + mov r3, qcoeffmp + mova m4, [r2] ; m4 = shift + mov r4, dqcoeffmp + mov r5, iscanmp +%ifidn %1, b_32x32 + psllw m4, 1 +%endif + pxor m5, m5 ; m5 = dedicated zero + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob +%if CONFIG_HIGHBITDEPTH + lea coeffq, [ coeffq+ncoeffq*4] + lea qcoeffq, [ qcoeffq+ncoeffq*4] + lea dqcoeffq, [dqcoeffq+ncoeffq*4] +%else + lea coeffq, [ coeffq+ncoeffq*2] + lea qcoeffq, [ qcoeffq+ncoeffq*2] + lea dqcoeffq, [dqcoeffq+ncoeffq*2] +%endif + lea iscanq, [ iscanq+ncoeffq*2] + neg ncoeffq + + ; get DC and first 15 AC coeffs +%if CONFIG_HIGHBITDEPTH + ; coeff stored as 32bit numbers & require 16bit numbers + mova m9, [ coeffq+ncoeffq*4+ 0] + packssdw m9, [ coeffq+ncoeffq*4+16] + mova m10, [ coeffq+ncoeffq*4+32] + packssdw m10, [ coeffq+ncoeffq*4+48] +%else + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] +%endif + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + punpckhqdq m0, m0 + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m8, m6 ; m8 += m6 + paddw m13, m11 ; m13 += m11 + pmulhw m8, m4 ; m8 = m8*qsh>>16 + punpckhqdq m4, m4 + pmulhw m13, m4 ; m13 = m13*qsh>>16 + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m8, m7 + pand m13, 
m12 +%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + mova m11, m8 + mova m6, m8 + pcmpgtw m5, m8 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register +%else + mova [qcoeffq+ncoeffq*2+ 0], m8 + mova [qcoeffq+ncoeffq*2+16], m13 +%endif +%ifidn %1, b_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif + pmullw m8, m3 ; dqc[i] = qc[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 +%endif +%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + mova m11, m8 + mova m6, m8 + pcmpgtw m5, m8 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register +%else + mova [dqcoeffq+ncoeffq*2+ 0], m8 + mova [dqcoeffq+ncoeffq*2+16], m13 +%endif + pcmpeqw m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m8, m6 ; m8 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m13 + add ncoeffq, mmsize + jz .accumulate_eob + +.ac_only_loop: +%if CONFIG_HIGHBITDEPTH + ; pack coeff from 32bit to 16bit array + mova m9, [ coeffq+ncoeffq*4+ 0] + packssdw m9, [ coeffq+ncoeffq*4+16] + mova m10, [ coeffq+ncoeffq*4+32] + packssdw m10, [ coeffq+ncoeffq*4+48] +%else + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] 
+ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] +%endif + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin +%ifidn %1, b_32x32 + pmovmskb r6d, m7 + pmovmskb r2d, m12 + or r6, r2 + jz .skip_iter +%endif + paddsw m6, m1 ; m6 += round + paddsw m11, m1 ; m11 += round + pmulhw m14, m6, m2 ; m14 = m6*q>>16 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m14, m6 ; m14 += m6 + paddw m13, m11 ; m13 += m11 + pmulhw m14, m4 ; m14 = m14*qsh>>16 + pmulhw m13, m4 ; m13 = m13*qsh>>16 + psignw m14, m9 ; m14 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m14, m7 + pand m13, m12 +%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pxor m11, m11 + mova m11, m14 + mova m6, m14 + pcmpgtw m5, m14 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register +%else + mova [qcoeffq+ncoeffq*2+ 0], m14 + mova [qcoeffq+ncoeffq*2+16], m13 +%endif +%ifidn %1, b_32x32 + pabsw m14, m14 + pabsw m13, m13 +%endif + pmullw m14, m3 ; dqc[i] = qc[i] * q + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m14, 1 + psrlw m13, 1 + psignw m14, m9 + psignw m13, m10 +%endif +%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + mova m11, m14 + mova m6, m14 + pcmpgtw m5, m14 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 +%else + mova [dqcoeffq+ncoeffq*2+ 0], m14 + mova [dqcoeffq+ncoeffq*2+16], 
m13 +%endif + pcmpeqw m14, m5 ; m14 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m14, m6 ; m14 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m14 + pmaxsw m8, m13 + add ncoeffq, mmsize + jl .ac_only_loop + +%ifidn %1, b_32x32 + jmp .accumulate_eob +.skip_iter: +%if CONFIG_HIGHBITDEPTH + mova [qcoeffq+ncoeffq*4+ 0], m5 + mova [qcoeffq+ncoeffq*4+16], m5 + mova [qcoeffq+ncoeffq*4+32], m5 + mova [qcoeffq+ncoeffq*4+48], m5 + mova [dqcoeffq+ncoeffq*4+ 0], m5 + mova [dqcoeffq+ncoeffq*4+16], m5 + mova [dqcoeffq+ncoeffq*4+32], m5 + mova [dqcoeffq+ncoeffq*4+48], m5 +%else + mova [qcoeffq+ncoeffq*2+ 0], m5 + mova [qcoeffq+ncoeffq*2+16], m5 + mova [dqcoeffq+ncoeffq*2+ 0], m5 + mova [dqcoeffq+ncoeffq*2+16], m5 +%endif + add ncoeffq, mmsize + jl .ac_only_loop +%endif + +.accumulate_eob: + ; horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + pextrw r6, m8, 0 + mov [r2], r6w ; 16-bit store, matching `mov word [eobq], 0` in .blank + RET + + ; skip-block, i.e.
just write all zeroes +.blank: + mov r0, dqcoeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, qcoeffmp + mov r3, eobmp + DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob +%if CONFIG_HIGHBITDEPTH + lea dqcoeffq, [dqcoeffq+ncoeffq*4] + lea qcoeffq, [ qcoeffq+ncoeffq*4] +%else + lea dqcoeffq, [dqcoeffq+ncoeffq*2] + lea qcoeffq, [ qcoeffq+ncoeffq*2] +%endif + neg ncoeffq + pxor m7, m7 +.blank_loop: +%if CONFIG_HIGHBITDEPTH + mova [dqcoeffq+ncoeffq*4+ 0], m7 + mova [dqcoeffq+ncoeffq*4+16], m7 + mova [dqcoeffq+ncoeffq*4+32], m7 + mova [dqcoeffq+ncoeffq*4+48], m7 + mova [qcoeffq+ncoeffq*4+ 0], m7 + mova [qcoeffq+ncoeffq*4+16], m7 + mova [qcoeffq+ncoeffq*4+32], m7 + mova [qcoeffq+ncoeffq*4+48], m7 +%else + mova [dqcoeffq+ncoeffq*2+ 0], m7 + mova [dqcoeffq+ncoeffq*2+16], m7 + mova [qcoeffq+ncoeffq*2+ 0], m7 + mova [qcoeffq+ncoeffq*2+16], m7 +%endif + add ncoeffq, mmsize + jl .blank_loop + mov word [eobq], 0 + RET +%endmacro + +INIT_XMM ssse3 +QUANTIZE_FN b, 7 +QUANTIZE_FN b_32x32, 7 diff --git a/third_party/aom/aom_dsp/x86/sad4d_avx2.c b/third_party/aom/aom_dsp/x86/sad4d_avx2.c new file mode 100644 index 000000000..e60f518b4 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad4d_avx2.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include // AVX2 +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" + +void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; + __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m256i sum_mlow, sum_mhigh; + int i; + const uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + sum_ref0 = _mm256_set1_epi16(0); + sum_ref1 = _mm256_set1_epi16(0); + sum_ref2 = _mm256_set1_epi16(0); + sum_ref3 = _mm256_set1_epi16(0); + for (i = 0; i < 32; i++) { + // load src and all refs + src_reg = _mm256_loadu_si256((const __m256i *)src); + ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); + ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); + ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); + ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + { + __m128i sum; + // in sum_ref-i the result is saved in the first 4 bytes + // the other 4 bytes are zeroed. 
+ // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm256_slli_si256(sum_ref1, 4); + sum_ref3 = _mm256_slli_si256(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); + sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref-i + sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), + _mm256_extractf128_si256(sum_mlow, 1)); + + _mm_storeu_si128((__m128i *)(res), sum); + } + _mm256_zeroupper(); +} + +void aom_sad64x64x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; + __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; + __m256i ref3_reg, ref3next_reg; + __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m256i sum_mlow, sum_mhigh; + int i; + const uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + sum_ref0 = _mm256_set1_epi16(0); + sum_ref1 = _mm256_set1_epi16(0); + sum_ref2 = _mm256_set1_epi16(0); + sum_ref3 = _mm256_set1_epi16(0); + for (i = 0; i < 64; i++) { + // load 64 bytes from src and all refs + src_reg = _mm256_loadu_si256((const __m256i *)src); + srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32)); + ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); + ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32)); + ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); + ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32)); + ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); + ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32)); + ref3_reg = 
_mm256_loadu_si256((const __m256i *)ref3); + ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32)); + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); + ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg); + ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg); + ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg); + ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg); + + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg); + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + { + __m128i sum; + + // in sum_ref-i the result is saved in the first 4 bytes + // the other 4 bytes are zeroed. 
+ // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm256_slli_si256(sum_ref1, 4); + sum_ref3 = _mm256_slli_si256(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); + sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref-i + sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), + _mm256_extractf128_si256(sum_mlow, 1)); + + _mm_storeu_si128((__m128i *)(res), sum); + } + _mm256_zeroupper(); +} + +void aom_sad32x64x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + const uint8_t *rf[4]; + uint32_t sum0[4]; + uint32_t sum1[4]; + + rf[0] = ref[0]; + rf[1] = ref[1]; + rf[2] = ref[2]; + rf[3] = ref[3]; + aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0); + src += src_stride << 5; + rf[0] += ref_stride << 5; + rf[1] += ref_stride << 5; + rf[2] += ref_stride << 5; + rf[3] += ref_stride << 5; + aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1); + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + res[2] = sum0[2] + sum1[2]; + res[3] = sum0[3] + sum1[3]; +} + +void aom_sad64x32x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + const uint8_t *rf[4]; + uint32_t sum0[4]; + uint32_t sum1[4]; + unsigned int half_width = 32; + + rf[0] = ref[0]; + rf[1] = ref[1]; + rf[2] = ref[2]; + rf[3] = ref[3]; + aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0); + src += half_width; + rf[0] += half_width; + rf[1] += half_width; + rf[2] += half_width; + rf[3] += half_width; + aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1); + res[0] = 
sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + res[2] = sum0[2] + sum1[2]; + res[3] = sum0[3] + sum1[3]; +} diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm new file mode 100644 index 000000000..8f04ef2f3 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm @@ -0,0 +1,253 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_4x2x4 5-6 0 + movd m0, [srcq +%2] +%if %1 == 1 + movd m6, [ref1q+%3] + movd m4, [ref2q+%3] + movd m7, [ref3q+%3] + movd m5, [ref4q+%3] + movd m1, [srcq +%4] + movd m2, [ref1q+%5] + punpckldq m0, m1 + punpckldq m6, m2 + movd m1, [ref2q+%5] + movd m2, [ref3q+%5] + movd m3, [ref4q+%5] + punpckldq m4, m1 + punpckldq m7, m2 + punpckldq m5, m3 + movlhps m0, m0 + movlhps m6, m4 + movlhps m7, m5 + psadbw m6, m0 + psadbw m7, m0 +%else + movd m1, [ref1q+%3] + movd m5, [ref1q+%5] + movd m2, [ref2q+%3] + movd m4, [ref2q+%5] + punpckldq m1, m5 + punpckldq m2, m4 + movd m3, [ref3q+%3] + movd m5, [ref3q+%5] + punpckldq m3, m5 + movd m4, [ref4q+%3] + movd m5, [ref4q+%5] + punpckldq m4, m5 + movd m5, [srcq +%4] + punpckldq m0, m5 + movlhps m0, m0 + movlhps m1, m2 + movlhps m3, m4 + psadbw m1, m0 + psadbw m3, m0 + paddd m6, m1 + paddd m7, m3 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, 
[ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif +%endmacro + +; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_8x2x4 5-6 0 + movh m0, [srcq +%2] +%if %1 == 1 + movh m4, [ref1q+%3] + movh m5, [ref2q+%3] + movh m6, [ref3q+%3] + movh m7, [ref4q+%3] + movhps m0, [srcq +%4] + movhps m4, [ref1q+%5] + movhps m5, [ref2q+%5] + movhps m6, [ref3q+%5] + movhps m7, [ref4q+%5] + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%else + movh m1, [ref1q+%3] + movh m2, [ref2q+%3] + movh m3, [ref3q+%3] + movhps m0, [srcq +%4] + movhps m1, [ref1q+%5] + movhps m2, [ref2q+%5] + movhps m3, [ref3q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movh m1, [ref4q+%3] + movhps m1, [ref4q+%5] + paddd m5, m2 + paddd m6, m3 + psadbw m1, m0 + paddd m7, m1 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif +%endmacro + +; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_16x2x4 5-6 0 + ; 1st 16 px + mova m0, [srcq +%2] +%if %1 == 1 + movu m4, [ref1q+%3] + movu m5, [ref2q+%3] + movu m6, [ref3q+%3] + movu m7, [ref4q+%3] + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%else + movu m1, [ref1q+%3] + movu m2, [ref2q+%3] + movu m3, [ref3q+%3] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movu m1, [ref4q+%3] + paddd m5, m2 + paddd m6, m3 + psadbw m1, m0 + paddd m7, m1 +%endif + + ; 2nd 16 px + mova m0, [srcq +%4] + movu m1, [ref1q+%5] + movu m2, [ref2q+%5] + movu m3, [ref3q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movu m1, [ref4q+%5] + paddd m5, m2 + paddd m6, m3 +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif + psadbw m1, m0 + 
paddd m7, m1 +%endmacro + +; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_32x2x4 5-6 0 + PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16 + PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6 +%endmacro + +; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_64x2x4 5-6 0 + PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32 + PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6 +%endmacro + +; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_128x2x4 5-6 0 + PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64 + PROCESS_64x2x4 0, %4, %5, %4 + 64, %5 + 64, %6 +%endmacro + +; void aom_sadNxNx4d_sse2(uint8_t *src, int src_stride, +; uint8_t *ref[4], int ref_stride, +; uint32_t res[4]); +; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4 +%macro SADNXN4D 2 +%if UNIX64 +cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov ref2q, [ref1q+gprsize*1] + mov ref3q, [ref1q+gprsize*2] + mov ref4q, [ref1q+gprsize*3] + mov ref1q, [ref1q+gprsize*0] + + PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 +%rep (%2-4)/2 + PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 +%endrep + PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 + +%if %1 > 4 + pslldq m5, 4 + pslldq m7, 4 + por m4, m5 + por m6, m7 + mova m5, m4 + mova m7, m6 + punpcklqdq m4, m6 + punpckhqdq m5, m7 + movifnidn r4, r4mp + paddd m4, m5 + movu [r4], m4 + RET +%else + movifnidn r4, r4mp + pshufd m6, m6, 0x08 + pshufd m7, m7, 0x08 + movq [r4+0], m6 + movq [r4+8], m7 + RET +%endif +%endmacro + +INIT_XMM sse2 +%if CONFIG_EXT_PARTITION +SADNXN4D 128, 128 +SADNXN4D 128, 64 +SADNXN4D 64, 128 +%endif +SADNXN4D 64, 64 +SADNXN4D 64, 32 +SADNXN4D 32, 64 +SADNXN4D 32, 32 +SADNXN4D 32, 16 +SADNXN4D 16, 32 +SADNXN4D 
16, 16 +SADNXN4D 16, 8 +SADNXN4D 8, 16 +SADNXN4D 8, 8 +SADNXN4D 8, 4 +SADNXN4D 4, 8 +SADNXN4D 4, 4 diff --git a/third_party/aom/aom_dsp/x86/sad_avx2.c b/third_party/aom/aom_dsp/x86/sad_avx2.c new file mode 100644 index 000000000..efba61289 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad_avx2.c @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include +#include "./aom_dsp_rtcd.h" +#include "aom_ports/mem.h" + +#define FSAD64_H(h) \ + unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + for (i = 0; i < h; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref_stride; \ + src_ptr += src_stride; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ 
+ _mm256_zeroupper(); \ + return res; \ + } + +#define FSAD32_H(h) \ + unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + int ref2_stride = ref_stride << 1; \ + int src2_stride = src_stride << 1; \ + int max = h >> 1; \ + for (i = 0; i < max; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref2_stride; \ + src_ptr += src2_stride; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + _mm256_zeroupper(); \ + return res; \ + } + +#define FSAD64 \ + FSAD64_H(64); \ + FSAD64_H(32); + +#define FSAD32 \ + FSAD32_H(64); \ + FSAD32_H(32); \ + FSAD32_H(16); + +/* clang-format off */ +FSAD64 +FSAD32 +/* clang-format on */ + +#undef FSAD64 +#undef FSAD32 +#undef FSAD64_H +#undef FSAD32_H + +#define FSADAVG64_H(h) \ + unsigned int aom_sad64x##h##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + for (i = 0; i < h; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = 
_mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ + ref1_reg = _mm256_avg_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref_stride; \ + src_ptr += src_stride; \ + second_pred += 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + _mm256_zeroupper(); \ + return res; \ + } + +#define FSADAVG32_H(h) \ + unsigned int aom_sad32x##h##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + int ref2_stride = ref_stride << 1; \ + int src2_stride = src_stride << 1; \ + int max = h >> 1; \ + for (i = 0; i < max; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ + ref1_reg = _mm256_avg_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, 
_mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref2_stride; \ + src_ptr += src2_stride; \ + second_pred += 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + _mm256_zeroupper(); \ + return res; \ + } + +#define FSADAVG64 \ + FSADAVG64_H(64); \ + FSADAVG64_H(32); + +#define FSADAVG32 \ + FSADAVG32_H(64); \ + FSADAVG32_H(32); \ + FSADAVG32_H(16); + +/* clang-format off */ +FSADAVG64 +FSADAVG32 +/* clang-format on */ + +#undef FSADAVG64 +#undef FSADAVG32 +#undef FSADAVG64_H +#undef FSADAVG32_H diff --git a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c new file mode 100644 index 000000000..196394379 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c @@ -0,0 +1,1043 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +// SAD +static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) { + // input 8 32-bit summation + __m128i lo128, hi128; + __m256i u = _mm256_srli_si256(*v, 8); + u = _mm256_add_epi32(u, *v); + + // 4 32-bit summation + hi128 = _mm256_extracti128_si256(u, 1); + lo128 = _mm256_castsi256_si128(u); + lo128 = _mm_add_epi32(hi128, lo128); + + // 2 32-bit summation + hi128 = _mm_srli_si128(lo128, 4); + lo128 = _mm_add_epi32(lo128, hi128); + + return (unsigned int)_mm_cvtsi128_si32(lo128); +} + +unsigned int aom_highbd_sad16x8_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); + const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); + + // first 4 rows + __m256i s0 = _mm256_loadu_si256((const __m256i *)src_ptr); + __m256i s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + __m256i s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); + __m256i s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); + + __m256i r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); + __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); + __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); + + __m256i u0 = _mm256_sub_epi16(s0, r0); + __m256i u1 = _mm256_sub_epi16(s1, r1); + __m256i u2 = _mm256_sub_epi16(s2, r2); + __m256i u3 = _mm256_sub_epi16(s3, r3); + __m256i zero = _mm256_setzero_si256(); + __m256i sum0, sum1; + + u0 = _mm256_abs_epi16(u0); + u1 = _mm256_abs_epi16(u1); + u2 = _mm256_abs_epi16(u2); + u3 = _mm256_abs_epi16(u3); + + sum0 = _mm256_add_epi16(u0, u1); + sum0 = _mm256_add_epi16(sum0, u2); + sum0 = _mm256_add_epi16(sum0, u3); + + // second 4 rows + src_ptr += src_stride << 2; + 
ref_ptr += ref_stride << 2; + s0 = _mm256_loadu_si256((const __m256i *)src_ptr); + s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); + s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); + + r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); + r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); + r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); + + u0 = _mm256_sub_epi16(s0, r0); + u1 = _mm256_sub_epi16(s1, r1); + u2 = _mm256_sub_epi16(s2, r2); + u3 = _mm256_sub_epi16(s3, r3); + + u0 = _mm256_abs_epi16(u0); + u1 = _mm256_abs_epi16(u1); + u2 = _mm256_abs_epi16(u2); + u3 = _mm256_abs_epi16(u3); + + sum1 = _mm256_add_epi16(u0, u1); + sum1 = _mm256_add_epi16(sum1, u2); + sum1 = _mm256_add_epi16(sum1, u3); + + // find out the SAD + s0 = _mm256_unpacklo_epi16(sum0, zero); + s1 = _mm256_unpackhi_epi16(sum0, zero); + r0 = _mm256_unpacklo_epi16(sum1, zero); + r1 = _mm256_unpackhi_epi16(sum1, zero); + s0 = _mm256_add_epi32(s0, s1); + r0 = _mm256_add_epi32(r0, r1); + sum0 = _mm256_add_epi32(s0, r0); + // 8 32-bit summation + + return (unsigned int)get_sad_from_mm256_epi32(&sum0); +} + +unsigned int aom_highbd_sad16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); + const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); + __m256i s0, s1, s2, s3, r0, r1, r2, r3, u0, u1, u2, u3; + __m256i sum0; + __m256i sum = _mm256_setzero_si256(); + const __m256i zero = _mm256_setzero_si256(); + int row = 0; + + // Loop for every 4 rows + while (row < 16) { + s0 = _mm256_loadu_si256((const __m256i *)src_ptr); + s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); + s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 
3 * src_stride)); + + r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); + r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); + r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); + + u0 = _mm256_sub_epi16(s0, r0); + u1 = _mm256_sub_epi16(s1, r1); + u2 = _mm256_sub_epi16(s2, r2); + u3 = _mm256_sub_epi16(s3, r3); + + u0 = _mm256_abs_epi16(u0); + u1 = _mm256_abs_epi16(u1); + u2 = _mm256_abs_epi16(u2); + u3 = _mm256_abs_epi16(u3); + + sum0 = _mm256_add_epi16(u0, u1); + sum0 = _mm256_add_epi16(sum0, u2); + sum0 = _mm256_add_epi16(sum0, u3); + + s0 = _mm256_unpacklo_epi16(sum0, zero); + s1 = _mm256_unpackhi_epi16(sum0, zero); + sum = _mm256_add_epi32(sum, s0); + sum = _mm256_add_epi32(sum, s1); + // 8 32-bit summation + + row += 4; + src_ptr += src_stride << 2; + ref_ptr += ref_stride << 2; + } + return get_sad_from_mm256_epi32(&sum); +} + +static void sad32x4(const uint16_t *src_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s0, s1, s2, s3, r0, r1, r2, r3; + const __m256i zero = _mm256_setzero_si256(); + int row_sections = 0; + + while (row_sections < 2) { + s0 = _mm256_loadu_si256((const __m256i *)src_ptr); + s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16)); + + r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); + r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16)); + + if (sec_ptr) { + r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr)); + r1 = _mm256_avg_epu16( + r1, _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r2 = _mm256_avg_epu16( + r2, _mm256_loadu_si256((const 
__m256i *)(sec_ptr + 32))); + r3 = _mm256_avg_epu16( + r3, _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + } + s0 = _mm256_sub_epi16(s0, r0); + s1 = _mm256_sub_epi16(s1, r1); + s2 = _mm256_sub_epi16(s2, r2); + s3 = _mm256_sub_epi16(s3, r3); + + s0 = _mm256_abs_epi16(s0); + s1 = _mm256_abs_epi16(s1); + s2 = _mm256_abs_epi16(s2); + s3 = _mm256_abs_epi16(s3); + + s0 = _mm256_add_epi16(s0, s1); + s0 = _mm256_add_epi16(s0, s2); + s0 = _mm256_add_epi16(s0, s3); + + r0 = _mm256_unpacklo_epi16(s0, zero); + r1 = _mm256_unpackhi_epi16(s0, zero); + + r0 = _mm256_add_epi32(r0, r1); + *sad_acc = _mm256_add_epi32(*sad_acc, r0); + + row_sections += 1; + src_ptr += src_stride << 1; + ref_ptr += ref_stride << 1; + if (sec_ptr) sec_ptr += 32 << 1; + } +} + +unsigned int aom_highbd_sad32x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + const int left_shift = 2; + int row_section = 0; + + while (row_section < 4) { + sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad16x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + uint32_t sum = aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride); + src += src_stride << 4; + ref += ref_stride << 4; + sum += aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride); + return sum; +} + +unsigned int aom_highbd_sad32x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + uint32_t sum = aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride); + src += src_stride << 4; + ref += ref_stride << 4; + sum += aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride); + return sum; +} + +unsigned int aom_highbd_sad32x64_avx2(const 
uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + uint32_t sum = aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride); + src += src_stride << 5; + ref += ref_stride << 5; + sum += aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride); + return sum; +} + +static void sad64x2(const uint16_t *src_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s[8], r[8]; + const __m256i zero = _mm256_setzero_si256(); + + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); + s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16)); + s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 32)); + s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 48)); + + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); + r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16)); + r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 32)); + r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 48)); + + if (sec_ptr) { + r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + r[4] = _mm256_avg_epu16( + r[4], 
_mm256_loadu_si256((const __m256i *)(sec_ptr + 64))); + r[5] = _mm256_avg_epu16( + r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80))); + r[6] = _mm256_avg_epu16( + r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96))); + r[7] = _mm256_avg_epu16( + r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112))); + } + + s[0] = _mm256_sub_epi16(s[0], r[0]); + s[1] = _mm256_sub_epi16(s[1], r[1]); + s[2] = _mm256_sub_epi16(s[2], r[2]); + s[3] = _mm256_sub_epi16(s[3], r[3]); + s[4] = _mm256_sub_epi16(s[4], r[4]); + s[5] = _mm256_sub_epi16(s[5], r[5]); + s[6] = _mm256_sub_epi16(s[6], r[6]); + s[7] = _mm256_sub_epi16(s[7], r[7]); + + s[0] = _mm256_abs_epi16(s[0]); + s[1] = _mm256_abs_epi16(s[1]); + s[2] = _mm256_abs_epi16(s[2]); + s[3] = _mm256_abs_epi16(s[3]); + s[4] = _mm256_abs_epi16(s[4]); + s[5] = _mm256_abs_epi16(s[5]); + s[6] = _mm256_abs_epi16(s[6]); + s[7] = _mm256_abs_epi16(s[7]); + + s[0] = _mm256_add_epi16(s[0], s[1]); + s[0] = _mm256_add_epi16(s[0], s[2]); + s[0] = _mm256_add_epi16(s[0], s[3]); + + s[4] = _mm256_add_epi16(s[4], s[5]); + s[4] = _mm256_add_epi16(s[4], s[6]); + s[4] = _mm256_add_epi16(s[4], s[7]); + + r[0] = _mm256_unpacklo_epi16(s[0], zero); + r[1] = _mm256_unpackhi_epi16(s[0], zero); + r[2] = _mm256_unpacklo_epi16(s[4], zero); + r[3] = _mm256_unpackhi_epi16(s[4], zero); + + r[0] = _mm256_add_epi32(r[0], r[1]); + r[0] = _mm256_add_epi32(r[0], r[2]); + r[0] = _mm256_add_epi32(r[0], r[3]); + *sad_acc = _mm256_add_epi32(*sad_acc, r[0]); +} + +unsigned int aom_highbd_sad64x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + const int left_shift = 1; + int row_section = 0; + + while (row_section < 16) { + sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + row_section += 1; + } + return 
get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad64x64_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + uint32_t sum = aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride); + src += src_stride << 5; + ref += ref_stride << 5; + sum += aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride); + return sum; +} + +#if CONFIG_EXT_PARTITION +static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s[8], r[8]; + const __m256i zero = _mm256_setzero_si256(); + + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); + s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + 64)); + s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + 80)); + s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + 96)); + s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + 112)); + + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); + r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 64)); + r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 80)); + r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 96)); + r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 112)); + + if (sec_ptr) { + r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + r[4] = _mm256_avg_epu16( + r[4], _mm256_loadu_si256((const 
__m256i *)(sec_ptr + 64))); + r[5] = _mm256_avg_epu16( + r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80))); + r[6] = _mm256_avg_epu16( + r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96))); + r[7] = _mm256_avg_epu16( + r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112))); + } + + s[0] = _mm256_sub_epi16(s[0], r[0]); + s[1] = _mm256_sub_epi16(s[1], r[1]); + s[2] = _mm256_sub_epi16(s[2], r[2]); + s[3] = _mm256_sub_epi16(s[3], r[3]); + s[4] = _mm256_sub_epi16(s[4], r[4]); + s[5] = _mm256_sub_epi16(s[5], r[5]); + s[6] = _mm256_sub_epi16(s[6], r[6]); + s[7] = _mm256_sub_epi16(s[7], r[7]); + + s[0] = _mm256_abs_epi16(s[0]); + s[1] = _mm256_abs_epi16(s[1]); + s[2] = _mm256_abs_epi16(s[2]); + s[3] = _mm256_abs_epi16(s[3]); + s[4] = _mm256_abs_epi16(s[4]); + s[5] = _mm256_abs_epi16(s[5]); + s[6] = _mm256_abs_epi16(s[6]); + s[7] = _mm256_abs_epi16(s[7]); + + s[0] = _mm256_add_epi16(s[0], s[1]); + s[0] = _mm256_add_epi16(s[0], s[2]); + s[0] = _mm256_add_epi16(s[0], s[3]); + + s[4] = _mm256_add_epi16(s[4], s[5]); + s[4] = _mm256_add_epi16(s[4], s[6]); + s[4] = _mm256_add_epi16(s[4], s[7]); + + r[0] = _mm256_unpacklo_epi16(s[0], zero); + r[1] = _mm256_unpackhi_epi16(s[0], zero); + r[2] = _mm256_unpacklo_epi16(s[4], zero); + r[3] = _mm256_unpackhi_epi16(s[4], zero); + + r[0] = _mm256_add_epi32(r[0], r[1]); + r[0] = _mm256_add_epi32(r[0], r[2]); + r[0] = _mm256_add_epi32(r[0], r[3]); + *sad_acc = _mm256_add_epi32(*sad_acc, r[0]); +} + +unsigned int aom_highbd_sad128x64_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + int row = 0; + while (row < 64) { + sad128x1(srcp, refp, NULL, &sad); + srcp += src_stride; + refp += ref_stride; + row += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad64x128_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int 
ref_stride) { + uint32_t sum = aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride); + src += src_stride << 6; + ref += ref_stride << 6; + sum += aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride); + return sum; +} + +unsigned int aom_highbd_sad128x128_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + uint32_t sum = aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride); + src += src_stride << 6; + ref += ref_stride << 6; + sum += aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride); + return sum; +} +#endif // CONFIG_EXT_PARTITION + +// If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD. +static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s0, s1, s2, s3, r0, r1, r2, r3; + const __m256i zero = _mm256_setzero_si256(); + + s0 = _mm256_loadu_si256((const __m256i *)src_ptr); + s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); + s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); + + r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); + r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); + r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); + + if (sec_ptr) { + r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr)); + r1 = _mm256_avg_epu16(r1, + _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r2 = _mm256_avg_epu16(r2, + _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r3 = _mm256_avg_epu16(r3, + _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + } + + s0 = _mm256_sub_epi16(s0, r0); + s1 = _mm256_sub_epi16(s1, r1); + s2 = _mm256_sub_epi16(s2, r2); + s3 = _mm256_sub_epi16(s3, r3); + + s0 = _mm256_abs_epi16(s0); + s1 = 
_mm256_abs_epi16(s1); + s2 = _mm256_abs_epi16(s2); + s3 = _mm256_abs_epi16(s3); + + s0 = _mm256_add_epi16(s0, s1); + s0 = _mm256_add_epi16(s0, s2); + s0 = _mm256_add_epi16(s0, s3); + + r0 = _mm256_unpacklo_epi16(s0, zero); + r1 = _mm256_unpackhi_epi16(s0, zero); + + r0 = _mm256_add_epi32(r0, r1); + *sad_acc = _mm256_add_epi32(*sad_acc, r0); +} + +unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + + sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); + + // Next 4 rows + srcp += src_stride << 2; + refp += ref_stride << 2; + secp += 64; + sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad16x16_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 3; + uint32_t sum = aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 16 << left_shift; + sum += aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 4; + uint32_t sum = aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 16 << left_shift; + sum += aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t 
*ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 2; + int row_section = 0; + + while (row_section < 4) { + sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 32 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad32x32_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 4; + uint32_t sum = aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 32 << left_shift; + sum += aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 5; + uint32_t sum = aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 32 << left_shift; + sum += aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 1; + int row_section = 0; + + while (row_section < 16) { + sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += 
src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 64 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 5; + uint32_t sum = aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 64 << left_shift; + sum += aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +#if CONFIG_EXT_PARTITION +unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 6; + uint32_t sum = aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 64 << left_shift; + sum += aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad128x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + int row = 0; + while (row < 64) { + sad128x1(srcp, refp, secp, &sad); + srcp += src_stride; + refp += ref_stride; + secp += 16 << 3; + row += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + unsigned int sum; + const int left_shift = 6; + + sum = aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << 
left_shift; + ref += ref_stride << left_shift; + second_pred += 128 << left_shift; + sum += aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} +#endif // CONFIG_EXT_PARTITION + +// SAD 4D +// Combine 4 __m256i vectors to uint32_t result[4] +static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v, + uint32_t *res) { + __m256i u0, u1, u2, u3; + const __m256i mask = _mm256_set1_epi64x(UINT32_MAX); + __m128i sad; + + // 8 32-bit summation + u0 = _mm256_srli_si256(v[0], 4); + u1 = _mm256_srli_si256(v[1], 4); + u2 = _mm256_srli_si256(v[2], 4); + u3 = _mm256_srli_si256(v[3], 4); + + u0 = _mm256_add_epi32(u0, v[0]); + u1 = _mm256_add_epi32(u1, v[1]); + u2 = _mm256_add_epi32(u2, v[2]); + u3 = _mm256_add_epi32(u3, v[3]); + + u0 = _mm256_and_si256(u0, mask); + u1 = _mm256_and_si256(u1, mask); + u2 = _mm256_and_si256(u2, mask); + u3 = _mm256_and_si256(u3, mask); + // 4 32-bit summation, evenly positioned + + u1 = _mm256_slli_si256(u1, 4); + u3 = _mm256_slli_si256(u3, 4); + + u0 = _mm256_or_si256(u0, u1); + u2 = _mm256_or_si256(u2, u3); + // 8 32-bit summation, interleaved + + u1 = _mm256_unpacklo_epi64(u0, u2); + u3 = _mm256_unpackhi_epi64(u0, u2); + + u0 = _mm256_add_epi32(u1, u3); + sad = _mm_add_epi32(_mm256_extractf128_si256(u0, 1), + _mm256_castsi256_si128(u0)); + _mm_storeu_si128((__m128i *)res, sad); +} + +static void convert_pointers(const uint8_t *const ref8[], + const uint16_t *ref[]) { + ref[0] = CONVERT_TO_SHORTPTR(ref8[0]); + ref[1] = CONVERT_TO_SHORTPTR(ref8[1]); + ref[2] = CONVERT_TO_SHORTPTR(ref8[2]); + ref[3] = CONVERT_TO_SHORTPTR(ref8[3]); +} + +static void init_sad(__m256i *s) { + s[0] = _mm256_setzero_si256(); + s[1] = _mm256_setzero_si256(); + s[2] = _mm256_setzero_si256(); + s[3] = _mm256_setzero_si256(); +} + +void aom_highbd_sad16x8x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; 
+ const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + const int shift_for_4_rows = 2; + int i; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); + srcp += src_stride << shift_for_4_rows; + refp[i] += ref_stride << shift_for_4_rows; + sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +void aom_highbd_sad16x16x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first8rows[4]; + uint32_t second8rows[4]; + const uint8_t *ref[4]; + const int shift_for_8_rows = 3; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, first8rows); + src += src_stride << shift_for_8_rows; + ref[0] += ref_stride << shift_for_8_rows; + ref[1] += ref_stride << shift_for_8_rows; + ref[2] += ref_stride << shift_for_8_rows; + ref[3] += ref_stride << shift_for_8_rows; + aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, second8rows); + sad_array[0] = first8rows[0] + second8rows[0]; + sad_array[1] = first8rows[1] + second8rows[1]; + sad_array[2] = first8rows[2] + second8rows[2]; + sad_array[3] = first8rows[3] + second8rows[3]; +} + +void aom_highbd_sad16x32x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first_half[4]; + uint32_t second_half[4]; + const uint8_t *ref[4]; + const int shift_for_rows = 4; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, first_half); + src += src_stride << shift_for_rows; + ref[0] += ref_stride << shift_for_rows; + ref[1] += ref_stride << shift_for_rows; + 
ref[2] += ref_stride << shift_for_rows; + ref[3] += ref_stride << shift_for_rows; + aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, second_half); + sad_array[0] = first_half[0] + second_half[0]; + sad_array[1] = first_half[1] + second_half[1]; + sad_array[2] = first_half[2] + second_half[2]; + sad_array[3] = first_half[3] + second_half[3]; +} + +void aom_highbd_sad32x16x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + const int shift_for_4_rows = 2; + int i; + int rows_section; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + rows_section = 0; + while (rows_section < 4) { + sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); + srcp += src_stride << shift_for_4_rows; + refp[i] += ref_stride << shift_for_4_rows; + rows_section++; + } + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +void aom_highbd_sad32x32x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first_half[4]; + uint32_t second_half[4]; + const uint8_t *ref[4]; + const int shift_for_rows = 4; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, first_half); + src += src_stride << shift_for_rows; + ref[0] += ref_stride << shift_for_rows; + ref[1] += ref_stride << shift_for_rows; + ref[2] += ref_stride << shift_for_rows; + ref[3] += ref_stride << shift_for_rows; + aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, second_half); + sad_array[0] = first_half[0] + second_half[0]; + sad_array[1] = first_half[1] + second_half[1]; + sad_array[2] = first_half[2] + second_half[2]; + sad_array[3] = first_half[3] + 
second_half[3]; +} + +void aom_highbd_sad32x64x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first_half[4]; + uint32_t second_half[4]; + const uint8_t *ref[4]; + const int shift_for_rows = 5; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, first_half); + src += src_stride << shift_for_rows; + ref[0] += ref_stride << shift_for_rows; + ref[1] += ref_stride << shift_for_rows; + ref[2] += ref_stride << shift_for_rows; + ref[3] += ref_stride << shift_for_rows; + aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, second_half); + sad_array[0] = first_half[0] + second_half[0]; + sad_array[1] = first_half[1] + second_half[1]; + sad_array[2] = first_half[2] + second_half[2]; + sad_array[3] = first_half[3] + second_half[3]; +} + +void aom_highbd_sad64x32x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + const int shift_for_rows = 1; + int i; + int rows_section; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + rows_section = 0; + while (rows_section < 16) { + sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]); + srcp += src_stride << shift_for_rows; + refp[i] += ref_stride << shift_for_rows; + rows_section++; + } + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +void aom_highbd_sad64x64x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first_half[4]; + uint32_t second_half[4]; + const uint8_t *ref[4]; + const int shift_for_rows = 5; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; 
+ ref[3] = ref_array[3]; + + aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, first_half); + src += src_stride << shift_for_rows; + ref[0] += ref_stride << shift_for_rows; + ref[1] += ref_stride << shift_for_rows; + ref[2] += ref_stride << shift_for_rows; + ref[3] += ref_stride << shift_for_rows; + aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, second_half); + sad_array[0] = first_half[0] + second_half[0]; + sad_array[1] = first_half[1] + second_half[1]; + sad_array[2] = first_half[2] + second_half[2]; + sad_array[3] = first_half[3] + second_half[3]; +} + +#if CONFIG_EXT_PARTITION +void aom_highbd_sad64x128x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first_half[4]; + uint32_t second_half[4]; + const uint8_t *ref[4]; + const int shift_for_rows = 6; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, first_half); + src += src_stride << shift_for_rows; + ref[0] += ref_stride << shift_for_rows; + ref[1] += ref_stride << shift_for_rows; + ref[2] += ref_stride << shift_for_rows; + ref[3] += ref_stride << shift_for_rows; + aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, second_half); + sad_array[0] = first_half[0] + second_half[0]; + sad_array[1] = first_half[1] + second_half[1]; + sad_array[2] = first_half[2] + second_half[2]; + sad_array[3] = first_half[3] + second_half[3]; +} + +void aom_highbd_sad128x64x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + int i; + int rows_section; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + rows_section = 0; + while (rows_section < 64) { + 
sad128x1(srcp, refp[i], NULL, &sad_vec[i]); + srcp += src_stride; + refp[i] += ref_stride; + rows_section++; + } + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +void aom_highbd_sad128x128x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first_half[4]; + uint32_t second_half[4]; + const uint8_t *ref[4]; + const int shift_for_rows = 6; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, first_half); + src += src_stride << shift_for_rows; + ref[0] += ref_stride << shift_for_rows; + ref[1] += ref_stride << shift_for_rows; + ref[2] += ref_stride << shift_for_rows; + ref[3] += ref_stride << shift_for_rows; + aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, second_half); + sad_array[0] = first_half[0] + second_half[0]; + sad_array[1] = first_half[1] + second_half[1]; + sad_array[2] = first_half[2] + second_half[2]; + sad_array[3] = first_half[3] + second_half[3]; +} +#endif // CONFIG_EXT_PARTITION diff --git a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c new file mode 100644 index 000000000..4419c65b2 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include "./aom_dsp_rtcd.h" + +static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + __m256i s1, s2, r1, r2; + __m256i sum = _mm256_setzero_si256(); + __m128i sum_i128; + int i; + + for (i = 0; i < 16; ++i) { + r1 = _mm256_loadu_si256((__m256i const *)ref_ptr); + r2 = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); + s1 = _mm256_sad_epu8(r1, _mm256_loadu_si256((__m256i const *)src_ptr)); + s2 = _mm256_sad_epu8( + r2, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); + sum = _mm256_add_epi32(sum, _mm256_add_epi32(s1, s2)); + ref_ptr += ref_stride << 1; + src_ptr += src_stride << 1; + } + + sum = _mm256_add_epi32(sum, _mm256_srli_si256(sum, 8)); + sum_i128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 1), + _mm256_castsi256_si128(sum)); + return _mm_cvtsi128_si32(sum_i128); +} + +static unsigned int sad64x32(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + unsigned int half_width = 32; + uint32_t sum = sad32x32(src_ptr, src_stride, ref_ptr, ref_stride); + src_ptr += half_width; + ref_ptr += half_width; + sum += sad32x32(src_ptr, src_stride, ref_ptr, ref_stride); + return sum; +} + +static unsigned int sad64x64(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + uint32_t sum = sad64x32(src_ptr, src_stride, ref_ptr, ref_stride); + src_ptr += src_stride << 5; + ref_ptr += ref_stride << 5; + sum += sad64x32(src_ptr, src_stride, ref_ptr, ref_stride); + return sum; +} + +unsigned int aom_sad128x64_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + unsigned int half_width = 64; + uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); + src_ptr += half_width; + ref_ptr += half_width; + sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); + return sum; +} + +unsigned int aom_sad64x128_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t 
*ref_ptr, int ref_stride) { + uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); + return sum; +} + +unsigned int aom_sad128x128_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + uint32_t sum = aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + sum += aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride); + return sum; +} + +static void sad64x64x4d(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + __m128i *res) { + uint32_t sum[4]; + aom_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, sum); + *res = _mm_loadu_si128((const __m128i *)sum); +} + +void aom_sad64x128x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + __m128i sum0, sum1; + const uint8_t *rf[4]; + + rf[0] = ref[0]; + rf[1] = ref[1]; + rf[2] = ref[2]; + rf[3] = ref[3]; + sad64x64x4d(src, src_stride, rf, ref_stride, &sum0); + src += src_stride << 6; + rf[0] += ref_stride << 6; + rf[1] += ref_stride << 6; + rf[2] += ref_stride << 6; + rf[3] += ref_stride << 6; + sad64x64x4d(src, src_stride, rf, ref_stride, &sum1); + sum0 = _mm_add_epi32(sum0, sum1); + _mm_storeu_si128((__m128i *)res, sum0); +} + +void aom_sad128x64x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + __m128i sum0, sum1; + unsigned int half_width = 64; + const uint8_t *rf[4]; + + rf[0] = ref[0]; + rf[1] = ref[1]; + rf[2] = ref[2]; + rf[3] = ref[3]; + sad64x64x4d(src, src_stride, rf, ref_stride, &sum0); + src += half_width; + rf[0] += half_width; + rf[1] += half_width; + rf[2] += half_width; + rf[3] += half_width; + sad64x64x4d(src, src_stride, rf, ref_stride, &sum1); + sum0 = _mm_add_epi32(sum0, sum1); + _mm_storeu_si128((__m128i 
*)res, sum0); +} + +void aom_sad128x128x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + const uint8_t *rf[4]; + uint32_t sum0[4]; + uint32_t sum1[4]; + + rf[0] = ref[0]; + rf[1] = ref[1]; + rf[2] = ref[2]; + rf[3] = ref[3]; + aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum0); + src += src_stride << 6; + rf[0] += ref_stride << 6; + rf[1] += ref_stride << 6; + rf[2] += ref_stride << 6; + rf[3] += ref_stride << 6; + aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum1); + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + res[2] = sum0[2] + sum1[2]; + res[3] = sum0[3] + sum1[3]; +} + +static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int h, const uint8_t *second_pred, + const int second_pred_stride) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + for (i = 0; i < h; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); + ref1_reg = _mm256_avg_epu8( + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); + ref2_reg = _mm256_avg_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref_stride; + src_ptr += src_stride; + second_pred += second_pred_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + + return 
res; +} + +unsigned int aom_sad64x128_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 64); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + second_pred += 64 << 6; + sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 64); + return sum; +} + +unsigned int aom_sad128x64_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + unsigned int half_width = 64; + uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 128); + src_ptr += half_width; + ref_ptr += half_width; + second_pred += half_width; + sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 128); + return sum; +} + +unsigned int aom_sad128x128_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + uint32_t sum = aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, + ref_stride, second_pred); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + second_pred += 128 << 6; + sum += aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, + second_pred); + return sum; +} diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm new file mode 100644 index 000000000..e45457a57 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad_sse2.asm @@ -0,0 +1,345 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. 
If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro SAD_FN 4 +%if %4 == 0 +%if %3 == 5 +cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%else ; avg +%if %3 == 5 +cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ + second_pred, n_rows +%else ; %3 == 7 +cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \ + ref, ref_stride, \ + second_pred, \ + src_stride3, ref_stride3 +%if ARCH_X86_64 +%define n_rowsd r7d +%else ; x86-32 +%define n_rowsd dword r0m +%endif ; x86-32/64 +%endif ; %3 == 5/7 +%endif ; avg/sad + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided +%if %3 == 7 + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] +%endif ; %3 == 7 +%endmacro + +%if CONFIG_EXT_PARTITION +; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD128XN 1-2 0 + SAD_FN 128, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+32] + psadbw m4, [srcq+48] + + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + + movu m1, [refq+64] + movu m2, [refq+80] + movu m3, [refq+96] + movu m4, [refq+112] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*4] + pavgb m2, [second_predq+mmsize*5] + pavgb m3, [second_predq+mmsize*6] + pavgb m4, [second_predq+mmsize*7] + lea second_predq, [second_predq+mmsize*8] 
+%endif + psadbw m1, [srcq+64] + psadbw m2, [srcq+80] + psadbw m3, [srcq+96] + psadbw m4, [srcq+112] + + add refq, ref_strideq + add srcq, src_strideq + + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + + sub n_rowsd, 1 + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD128XN 128 ; sad128x128_sse2 +SAD128XN 128, 1 ; sad128x128_avg_sse2 +SAD128XN 64 ; sad128x64_sse2 +SAD128XN 64, 1 ; sad128x64_avg_sse2 +%endif + + +; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD64XN 1-2 0 + SAD_FN 64, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+32] + psadbw m4, [srcq+48] + paddd m1, m2 + paddd m3, m4 + add refq, ref_strideq + paddd m0, m1 + add srcq, src_strideq + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +%if CONFIG_EXT_PARTITION +SAD64XN 128 ; sad64x128_sse2 +SAD64XN 128, 1 ; sad64x128_avg_sse2 +%endif +SAD64XN 64 ; sad64x64_sse2 +SAD64XN 32 ; sad64x32_sse2 +SAD64XN 64, 1 ; sad64x64_avg_sse2 +SAD64XN 32, 1 ; sad64x32_avg_sse2 + +; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD32XN 1-2 0 + SAD_FN 32, %1, 5, %2 + mov n_rowsd, %1/2 + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq] + movu m4, [refq+ref_strideq+16] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] 
+ psadbw m2, [srcq+16] + psadbw m3, [srcq+src_strideq] + psadbw m4, [srcq+src_strideq+16] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD32XN 64 ; sad32x64_sse2 +SAD32XN 32 ; sad32x32_sse2 +SAD32XN 16 ; sad32x16_sse2 +SAD32XN 64, 1 ; sad32x64_avg_sse2 +SAD32XN 32, 1 ; sad32x32_avg_sse2 +SAD32XN 16, 1 ; sad32x16_avg_sse2 + +; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD16XN 1-2 0 + SAD_FN 16, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+src_strideq] + psadbw m3, [srcq+src_strideq*2] + psadbw m4, [srcq+src_stride3q] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD16XN 32 ; sad16x32_sse2 +SAD16XN 16 ; sad16x16_sse2 +SAD16XN 8 ; sad16x8_sse2 +SAD16XN 32, 1 ; sad16x32_avg_sse2 +SAD16XN 16, 1 ; sad16x16_avg_sse2 +SAD16XN 8, 1 ; sad16x8_avg_sse2 + +; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD8XN 1-2 0 + SAD_FN 8, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movh m1, [refq] + movhps m1, [refq+ref_strideq] + movh m2, [refq+ref_strideq*2] + movhps m2, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + lea second_predq, [second_predq+mmsize*2] 
+%endif + movh m3, [srcq] + movhps m3, [srcq+src_strideq] + movh m4, [srcq+src_strideq*2] + movhps m4, [srcq+src_stride3q] + psadbw m1, m3 + psadbw m2, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m2 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD8XN 16 ; sad8x16_sse2 +SAD8XN 8 ; sad8x8_sse2 +SAD8XN 4 ; sad8x4_sse2 +SAD8XN 16, 1 ; sad8x16_avg_sse2 +SAD8XN 8, 1 ; sad8x8_avg_sse2 +SAD8XN 4, 1 ; sad8x4_avg_sse2 + +; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD4XN 1-2 0 + SAD_FN 4, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movd m1, [refq] + movd m2, [refq+ref_strideq] + movd m3, [refq+ref_strideq*2] + movd m4, [refq+ref_stride3q] + punpckldq m1, m2 + punpckldq m3, m4 + movlhps m1, m3 +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + lea second_predq, [second_predq+mmsize*1] +%endif + movd m2, [srcq] + movd m5, [srcq+src_strideq] + movd m4, [srcq+src_strideq*2] + movd m3, [srcq+src_stride3q] + punpckldq m2, m5 + punpckldq m4, m3 + movlhps m2, m4 + psadbw m1, m2 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD4XN 8 ; sad4x8_sse2 +SAD4XN 4 ; sad4x4_sse2 +SAD4XN 8, 1 ; sad4x8_avg_sse2 +SAD4XN 4, 1 ; sad4x4_avg_sse2 diff --git a/third_party/aom/aom_dsp/x86/sad_sse3.asm b/third_party/aom/aom_dsp/x86/sad_sse3.asm new file mode 100644 index 000000000..f6c27c855 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad_sse3.asm @@ -0,0 +1,377 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro STACK_FRAME_CREATE_X3 0 +%if ABI_IS_32BIT + %define src_ptr rsi + %define src_stride rax + %define ref_ptr rdi + %define ref_stride rdx + %define end_ptr rcx + %define ret_var rbx + %define result_ptr arg(4) + %define height dword ptr arg(4) + push rbp + mov rbp, rsp + push rsi + push rdi + push rbx + + mov rsi, arg(0) ; src_ptr + mov rdi, arg(2) ; ref_ptr + + movsxd rax, dword ptr arg(1) ; src_stride + movsxd rdx, dword ptr arg(3) ; ref_stride +%else + %if LIBAOM_YASM_WIN64 + SAVE_XMM 7, u + %define src_ptr rcx + %define src_stride rdx + %define ref_ptr r8 + %define ref_stride r9 + %define end_ptr r10 + %define ret_var r11 + %define result_ptr [rsp+xmm_stack_space+8+4*8] + %define height dword ptr [rsp+xmm_stack_space+8+4*8] + %else + %define src_ptr rdi + %define src_stride rsi + %define ref_ptr rdx + %define ref_stride rcx + %define end_ptr r9 + %define ret_var r10 + %define result_ptr r8 + %define height r8 + %endif +%endif + +%endmacro + +%macro STACK_FRAME_DESTROY_X3 0 + %define src_ptr + %define src_stride + %define ref_ptr + %define ref_stride + %define end_ptr + %define ret_var + %define result_ptr + %define height + +%if ABI_IS_32BIT + pop rbx + pop rdi + pop rsi + pop rbp +%else + %if LIBAOM_YASM_WIN64 + RESTORE_XMM + %endif +%endif + ret +%endmacro + +%macro PROCESS_16X2X3 5 +%if %1==0 + movdqa xmm0, XMMWORD PTR [%2] + lddqu xmm5, XMMWORD PTR [%3] + lddqu xmm6, XMMWORD PTR [%3+1] + lddqu xmm7, XMMWORD PTR [%3+2] + + psadbw xmm5, xmm0 + psadbw xmm6, xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, XMMWORD PTR [%2] + lddqu xmm1, XMMWORD PTR [%3] + lddqu xmm2, XMMWORD PTR [%3+1] 
+ lddqu xmm3, XMMWORD PTR [%3+2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endif + movdqa xmm0, XMMWORD PTR [%2+%4] + lddqu xmm1, XMMWORD PTR [%3+%5] + lddqu xmm2, XMMWORD PTR [%3+%5+1] + lddqu xmm3, XMMWORD PTR [%3+%5+2] + +%if %1==0 || %1==1 + lea %2, [%2+%4*2] + lea %3, [%3+%5*2] +%endif + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endmacro + +%macro PROCESS_8X2X3 5 +%if %1==0 + movq mm0, QWORD PTR [%2] + movq mm5, QWORD PTR [%3] + movq mm6, QWORD PTR [%3+1] + movq mm7, QWORD PTR [%3+2] + + psadbw mm5, mm0 + psadbw mm6, mm0 + psadbw mm7, mm0 +%else + movq mm0, QWORD PTR [%2] + movq mm1, QWORD PTR [%3] + movq mm2, QWORD PTR [%3+1] + movq mm3, QWORD PTR [%3+2] + + psadbw mm1, mm0 + psadbw mm2, mm0 + psadbw mm3, mm0 + + paddw mm5, mm1 + paddw mm6, mm2 + paddw mm7, mm3 +%endif + movq mm0, QWORD PTR [%2+%4] + movq mm1, QWORD PTR [%3+%5] + movq mm2, QWORD PTR [%3+%5+1] + movq mm3, QWORD PTR [%3+%5+2] + +%if %1==0 || %1==1 + lea %2, [%2+%4*2] + lea %3, [%3+%5*2] +%endif + + psadbw mm1, mm0 + psadbw mm2, mm0 + psadbw mm3, mm0 + + paddw mm5, mm1 + paddw mm6, mm2 + paddw mm7, mm3 +%endmacro + +;void int aom_sad16x16x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(aom_sad16x16x3_sse3) PRIVATE +sym(aom_sad16x16x3_sse3): + + STACK_FRAME_CREATE_X3 + + PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride + + mov rcx, 
result_ptr + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rcx], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rcx+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rcx+8], xmm0 + + STACK_FRAME_DESTROY_X3 + +;void int aom_sad16x8x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(aom_sad16x8x3_sse3) PRIVATE +sym(aom_sad16x8x3_sse3): + + STACK_FRAME_CREATE_X3 + + PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride + + mov rcx, result_ptr + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rcx], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rcx+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rcx+8], xmm0 + + STACK_FRAME_DESTROY_X3 + +;void int aom_sad8x16x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(aom_sad8x16x3_sse3) PRIVATE +sym(aom_sad8x16x3_sse3): + + STACK_FRAME_CREATE_X3 + + PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride + + mov rcx, result_ptr + + punpckldq mm5, mm6 + + movq [rcx], mm5 + movd [rcx+8], mm7 + + STACK_FRAME_DESTROY_X3 + +;void int aom_sad8x8x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int 
ref_stride, +; int *results) +global sym(aom_sad8x8x3_sse3) PRIVATE +sym(aom_sad8x8x3_sse3): + + STACK_FRAME_CREATE_X3 + + PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride + + mov rcx, result_ptr + + punpckldq mm5, mm6 + + movq [rcx], mm5 + movd [rcx+8], mm7 + + STACK_FRAME_DESTROY_X3 + +;void int aom_sad4x4x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(aom_sad4x4x3_sse3) PRIVATE +sym(aom_sad4x4x3_sse3): + + STACK_FRAME_CREATE_X3 + + movd mm0, DWORD PTR [src_ptr] + movd mm1, DWORD PTR [ref_ptr] + + movd mm2, DWORD PTR [src_ptr+src_stride] + movd mm3, DWORD PTR [ref_ptr+ref_stride] + + punpcklbw mm0, mm2 + punpcklbw mm1, mm3 + + movd mm4, DWORD PTR [ref_ptr+1] + movd mm5, DWORD PTR [ref_ptr+2] + + movd mm2, DWORD PTR [ref_ptr+ref_stride+1] + movd mm3, DWORD PTR [ref_ptr+ref_stride+2] + + psadbw mm1, mm0 + + punpcklbw mm4, mm2 + punpcklbw mm5, mm3 + + psadbw mm4, mm0 + psadbw mm5, mm0 + + lea src_ptr, [src_ptr+src_stride*2] + lea ref_ptr, [ref_ptr+ref_stride*2] + + movd mm0, DWORD PTR [src_ptr] + movd mm2, DWORD PTR [ref_ptr] + + movd mm3, DWORD PTR [src_ptr+src_stride] + movd mm6, DWORD PTR [ref_ptr+ref_stride] + + punpcklbw mm0, mm3 + punpcklbw mm2, mm6 + + movd mm3, DWORD PTR [ref_ptr+1] + movd mm7, DWORD PTR [ref_ptr+2] + + psadbw mm2, mm0 + + paddw mm1, mm2 + + movd mm2, DWORD PTR [ref_ptr+ref_stride+1] + movd mm6, DWORD PTR [ref_ptr+ref_stride+2] + + punpcklbw mm3, mm2 + punpcklbw mm7, mm6 + + psadbw mm3, mm0 + psadbw mm7, mm0 + + paddw mm3, mm4 + paddw mm7, mm5 + + mov rcx, result_ptr + + punpckldq mm1, mm3 + + movq [rcx], mm1 + movd [rcx+8], mm7 + + STACK_FRAME_DESTROY_X3 diff --git a/third_party/aom/aom_dsp/x86/sad_sse4.asm b/third_party/aom/aom_dsp/x86/sad_sse4.asm new file mode 100644 
index 000000000..5e9c75845 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad_sse4.asm @@ -0,0 +1,362 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "aom_ports/x86_abi_support.asm" + +%macro PROCESS_16X2X8 1 +%if %1 + movdqa xmm0, XMMWORD PTR [rsi] + movq xmm1, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + movq xmm2, MMWORD PTR [rdi+16] + punpcklqdq xmm1, xmm3 + punpcklqdq xmm3, xmm2 + + movdqa xmm2, xmm1 + mpsadbw xmm1, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + + psrldq xmm0, 8 + + movdqa xmm4, xmm3 + mpsadbw xmm3, xmm0, 0x0 + mpsadbw xmm4, xmm0, 0x5 + + paddw xmm1, xmm2 + paddw xmm1, xmm3 + paddw xmm1, xmm4 +%else + movdqa xmm0, XMMWORD PTR [rsi] + movq xmm5, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + movq xmm2, MMWORD PTR [rdi+16] + punpcklqdq xmm5, xmm3 + punpcklqdq xmm3, xmm2 + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + + psrldq xmm0, 8 + + movdqa xmm4, xmm3 + mpsadbw xmm3, xmm0, 0x0 + mpsadbw xmm4, xmm0, 0x5 + + paddw xmm5, xmm2 + paddw xmm5, xmm3 + paddw xmm5, xmm4 + + paddw xmm1, xmm5 +%endif + movdqa xmm0, XMMWORD PTR [rsi + rax] + movq xmm5, MMWORD PTR [rdi+ rdx] + movq xmm3, MMWORD PTR [rdi+ rdx+8] + movq xmm2, MMWORD PTR [rdi+ rdx+16] + punpcklqdq xmm5, xmm3 + punpcklqdq xmm3, xmm2 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + + psrldq xmm0, 8 + movdqa xmm4, xmm3 + mpsadbw xmm3, xmm0, 0x0 + mpsadbw xmm4, xmm0, 0x5 + + paddw xmm5, xmm2 + paddw 
xmm5, xmm3 + paddw xmm5, xmm4 + + paddw xmm1, xmm5 +%endmacro + +%macro PROCESS_8X2X8 1 +%if %1 + movq xmm0, MMWORD PTR [rsi] + movq xmm1, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm1, xmm3 + + movdqa xmm2, xmm1 + mpsadbw xmm1, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + paddw xmm1, xmm2 +%else + movq xmm0, MMWORD PTR [rsi] + movq xmm5, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm5, xmm3 + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + paddw xmm5, xmm2 + + paddw xmm1, xmm5 +%endif + movq xmm0, MMWORD PTR [rsi + rax] + movq xmm5, MMWORD PTR [rdi+ rdx] + movq xmm3, MMWORD PTR [rdi+ rdx+8] + punpcklqdq xmm5, xmm3 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + paddw xmm5, xmm2 + + paddw xmm1, xmm5 +%endmacro + +%macro PROCESS_4X2X8 1 +%if %1 + movd xmm0, [rsi] + movq xmm1, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm1, xmm3 + + mpsadbw xmm1, xmm0, 0x0 +%else + movd xmm0, [rsi] + movq xmm5, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm5, xmm3 + + mpsadbw xmm5, xmm0, 0x0 + + paddw xmm1, xmm5 +%endif + movd xmm0, [rsi + rax] + movq xmm5, MMWORD PTR [rdi+ rdx] + movq xmm3, MMWORD PTR [rdi+ rdx+8] + punpcklqdq xmm5, xmm3 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + mpsadbw xmm5, xmm0, 0x0 + + paddw xmm1, xmm5 +%endmacro + +%macro WRITE_AS_INTS 0 + mov rdi, arg(4) ;Results: 8 SADs zero-extended to 32 bits; movdqa stores need 16-byte alignment + pxor xmm0, xmm0 + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm0 + punpckhwd xmm2, xmm0 + + movdqa [rdi], xmm1 + movdqa [rdi + 16], xmm2 +%endmacro + +;void aom_sad16x16x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned int *sad_array); +global sym(aom_sad16x16x8_sse4_1) PRIVATE +sym(aom_sad16x16x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr
+ + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_16X2X8 1 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void aom_sad16x8x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned int *sad_array +;); +global sym(aom_sad16x8x8_sse4_1) PRIVATE +sym(aom_sad16x8x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_16X2X8 1 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void aom_sad8x8x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned int *sad_array +;); +global sym(aom_sad8x8x8_sse4_1) PRIVATE +sym(aom_sad8x8x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_8X2X8 1 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void aom_sad8x16x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned int *sad_array +;); +global sym(aom_sad8x16x8_sse4_1) PRIVATE +sym(aom_sad8x16x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd
rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_8X2X8 1 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void aom_sad4x4x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned int *sad_array +;); +global sym(aom_sad4x4x8_sse4_1) PRIVATE +sym(aom_sad4x4x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_4X2X8 1 + PROCESS_4X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + + + diff --git a/third_party/aom/aom_dsp/x86/sad_ssse3.asm b/third_party/aom/aom_dsp/x86/sad_ssse3.asm new file mode 100644 index 000000000..96b64b040 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad_ssse3.asm @@ -0,0 +1,373 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+; + +; + + +%include "aom_ports/x86_abi_support.asm" + +%macro PROCESS_16X2X3 1 +%if %1 + movdqa xmm0, XMMWORD PTR [rsi] + lddqu xmm5, XMMWORD PTR [rdi] + lddqu xmm6, XMMWORD PTR [rdi+1] + lddqu xmm7, XMMWORD PTR [rdi+2] + + psadbw xmm5, xmm0 + psadbw xmm6, xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, XMMWORD PTR [rsi] + lddqu xmm1, XMMWORD PTR [rdi] + lddqu xmm2, XMMWORD PTR [rdi+1] + lddqu xmm3, XMMWORD PTR [rdi+2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endif + movdqa xmm0, XMMWORD PTR [rsi+rax] + lddqu xmm1, XMMWORD PTR [rdi+rdx] + lddqu xmm2, XMMWORD PTR [rdi+rdx+1] + lddqu xmm3, XMMWORD PTR [rdi+rdx+2] + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endmacro + +%macro PROCESS_16X2X3_OFFSET 2 +%if %1 + movdqa xmm0, XMMWORD PTR [rsi] + movdqa xmm4, XMMWORD PTR [rdi] + movdqa xmm7, XMMWORD PTR [rdi+16] + + movdqa xmm5, xmm7 + palignr xmm5, xmm4, %2 + + movdqa xmm6, xmm7 + palignr xmm6, xmm4, (%2+1) + + palignr xmm7, xmm4, (%2+2) + + psadbw xmm5, xmm0 + psadbw xmm6, xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, XMMWORD PTR [rsi] + movdqa xmm4, XMMWORD PTR [rdi] + movdqa xmm3, XMMWORD PTR [rdi+16] + + movdqa xmm1, xmm3 + palignr xmm1, xmm4, %2 + + movdqa xmm2, xmm3 + palignr xmm2, xmm4, (%2+1) + + palignr xmm3, xmm4, (%2+2) + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endif + movdqa xmm0, XMMWORD PTR [rsi+rax] + movdqa xmm4, XMMWORD PTR [rdi+rdx] + movdqa xmm3, XMMWORD PTR [rdi+rdx+16] + + movdqa xmm1, xmm3 + palignr xmm1, xmm4, %2 + + movdqa xmm2, xmm3 + palignr xmm2, xmm4, (%2+1) + + palignr xmm3, xmm4, (%2+2) + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 
+%endmacro + +%macro PROCESS_16X16X3_OFFSET 2 +%2_aligned_by_%1: + + sub rdi, %1 + + PROCESS_16X2X3_OFFSET 1, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + + jmp %2_store_off + +%endmacro + +%macro PROCESS_16X8X3_OFFSET 2 +%2_aligned_by_%1: + + sub rdi, %1 + + PROCESS_16X2X3_OFFSET 1, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + + jmp %2_store_off + +%endmacro + +;void int aom_sad16x16x3_ssse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(aom_sad16x16x3_ssse3) PRIVATE +sym(aom_sad16x16x3_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + push rsi + push rdi + push rcx + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + mov rdx, 0xf + and rdx, rdi + + jmp .aom_sad16x16x3_ssse3_skiptable +.aom_sad16x16x3_ssse3_jumptable: + dd .aom_sad16x16x3_ssse3_aligned_by_0 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_1 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_2 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_3 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_4 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_5 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_6 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_7 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_8 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_9 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_10 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_11 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_12 - 
.aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_13 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_14 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_15 - .aom_sad16x16x3_ssse3_do_jump +.aom_sad16x16x3_ssse3_skiptable: + + call .aom_sad16x16x3_ssse3_do_jump +.aom_sad16x16x3_ssse3_do_jump: + pop rcx ; get the address of do_jump + mov rax, .aom_sad16x16x3_ssse3_jumptable - .aom_sad16x16x3_ssse3_do_jump + add rax, rcx ; get the absolute address of aom_sad16x16x3_ssse3_jumptable + + movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable + add rcx, rax + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + jmp rcx + + PROCESS_16X16X3_OFFSET 0, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 1, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 2, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 3, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 4, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 5, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 6, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 7, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 8, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 9, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 10, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 11, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 12, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 13, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 14, .aom_sad16x16x3_ssse3 + +.aom_sad16x16x3_ssse3_aligned_by_15: + PROCESS_16X2X3 1 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + +.aom_sad16x16x3_ssse3_store_off: + mov rdi, arg(4) ;Results + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rdi+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+8], xmm0 + + ; 
begin epilog + pop rcx + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_sad16x8x3_ssse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(aom_sad16x8x3_ssse3) PRIVATE +sym(aom_sad16x8x3_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + push rsi + push rdi + push rcx + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + mov rdx, 0xf + and rdx, rdi ; rdx = ref_ptr & 15 (byte misalignment, used as the jump table index) + + jmp .aom_sad16x8x3_ssse3_skiptable ; step over the inline offset table +.aom_sad16x8x3_ssse3_jumptable: + dd .aom_sad16x8x3_ssse3_aligned_by_0 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_1 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_2 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_3 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_4 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_5 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_6 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_7 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_8 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_9 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_10 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_11 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_12 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_13 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_14 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_15 - .aom_sad16x8x3_ssse3_do_jump +.aom_sad16x8x3_ssse3_skiptable: + + call .aom_sad16x8x3_ssse3_do_jump +.aom_sad16x8x3_ssse3_do_jump: + pop rcx ; get the address of do_jump + mov rax, .aom_sad16x8x3_ssse3_jumptable - .aom_sad16x8x3_ssse3_do_jump + add rax, rcx ; get the absolute address of aom_sad16x8x3_ssse3_jumptable + + movsxd
rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable + add rcx, rax ; rcx = do_jump + (aligned_by_N - do_jump) = address of the aligned_by_N code + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + jmp rcx ; dispatch on the misalignment of ref_ptr + + PROCESS_16X8X3_OFFSET 0, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 1, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 2, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 3, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 4, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 5, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 6, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 7, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 8, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 9, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 10, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 11, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 12, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 13, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 14, .aom_sad16x8x3_ssse3 + +.aom_sad16x8x3_ssse3_aligned_by_15: + + PROCESS_16X2X3 1 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + +.aom_sad16x8x3_ssse3_store_off: + mov rdi, arg(4) ;Results + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi], xmm0 ; results[0] = low qword + high qword of xmm5 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rdi+4], xmm0 ; results[1] = low qword + high qword of xmm6 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+8], xmm0 ; results[2] = low qword + high qword of xmm7 + + ; begin epilog + pop rcx + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm new file mode 100644 index 000000000..aa70106c8 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm @@ -0,0 +1,219 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "aom_ports/x86_abi_support.asm" + +; tabulate_ssim - accumulates sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr from the words in xmm3 (s) and xmm4 (r); clobbers xmm1-xmm3 +%macro TABULATE_SSIM 0 + paddusw xmm15, xmm3 ; sum_s + paddusw xmm14, xmm4 ; sum_r + movdqa xmm1, xmm3 + pmaddwd xmm1, xmm1 + paddd xmm13, xmm1 ; sum_sq_s + movdqa xmm2, xmm4 + pmaddwd xmm2, xmm2 + paddd xmm12, xmm2 ; sum_sq_r + pmaddwd xmm3, xmm4 + paddd xmm11, xmm3 ; sum_sxr +%endmacro + +; Sum the four dwords of register %1 (widened to qwords against xmm0, which must be zero); the total is left in the low qword of %1. Clobbers xmm2. +%macro SUM_ACROSS_Q 1 + movdqa xmm2,%1 + punpckldq %1,xmm0 + punpckhdq xmm2,xmm0 + paddq %1,xmm2 + movdqa xmm2,%1 + punpcklqdq %1,xmm0 + punpckhqdq xmm2,xmm0 + paddq %1,xmm2 +%endmacro + +; Sum the eight words of register %1: widen them to dwords against xmm0 (must be zero), then finish with SUM_ACROSS_Q. Clobbers xmm1 and xmm2. +%macro SUM_ACROSS_W 1 + movdqa xmm1, %1 + punpcklwd %1,xmm0 + punpckhwd xmm1,xmm0 + paddd %1, xmm1 + SUM_ACROSS_Q %1 +%endmacro +;void aom_ssim_parms_16x16_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp, +; uint32_t *sum_s, +; uint32_t *sum_r, +; uint32_t *sum_sq_s, +; uint32_t *sum_sq_r, +; uint32_t *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hassle, and can probably do better estimates with psadbw +; or pavgb. At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code.
+global sym(aom_ssim_parms_16x16_sse2) PRIVATE +sym(aom_ssim_parms_16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog - accumulates the five SSIM sums over a 16x16 block of s and r + + mov rsi, arg(0) ;s + movsxd rcx, dword ptr arg(1) ;sp (32-bit int stride: sign-extend, the upper half of the slot is not defined) + mov rdi, arg(2) ;r + movsxd rax, dword ptr arg(3) ;rp (32-bit int stride: sign-extend, the upper half of the slot is not defined) + + pxor xmm0, xmm0 ; zero register for the byte->word unpacks and the SUM_ACROSS_* macros + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 16 ;row counter +.NextRow: + + ;grab source and reference pixels (16 bytes of each row, unaligned loads) + movdqu xmm5, [rsi] + movdqu xmm6, [rdi] + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpckhbw xmm3, xmm0 ; high_s + punpckhbw xmm4, xmm0 ; high_r + + TABULATE_SSIM + + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; *sum_s + mov rdi,arg(5) + movd [rdi], xmm14; *sum_r + mov rdi,arg(6) + movd [rdi], xmm13; *sum_sq_s + mov rdi,arg(7) + movd [rdi], xmm12; *sum_sq_r + mov rdi,arg(8) + movd [rdi], xmm11; *sum_sxr + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_ssim_parms_8x8_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp, +; uint32_t *sum_s, +; uint32_t *sum_r, +; uint32_t *sum_sq_s, +; uint32_t *sum_sq_r, +; uint32_t *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hassle, and can probably do better estimates with psadbw +; or pavgb. At this point this is just meant to be first pass for calculating +; all the parms needed for 8x8 ssim so we can play with dssim as distortion +; in mode selection code.
+global sym(aom_ssim_parms_8x8_sse2) PRIVATE +sym(aom_ssim_parms_8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog - accumulates the five SSIM sums over an 8x8 block of s and r + + mov rsi, arg(0) ;s + movsxd rcx, dword ptr arg(1) ;sp (32-bit int stride: sign-extend, the upper half of the slot is not defined) + mov rdi, arg(2) ;r + movsxd rax, dword ptr arg(3) ;rp (32-bit int stride: sign-extend, the upper half of the slot is not defined) + + pxor xmm0, xmm0 ; zero register for the byte->word unpacks and the SUM_ACROSS_* macros + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 8 ;row counter +.NextRow: + + ;grab source and reference pixels (8 bytes of each row) + movq xmm3, [rsi] + movq xmm4, [rdi] + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; *sum_s + mov rdi,arg(5) + movd [rdi], xmm14; *sum_r + mov rdi,arg(6) + movd [rdi], xmm13; *sum_sq_s + mov rdi,arg(7) + movd [rdi], xmm12; *sum_sq_r + mov rdi,arg(8) + movd [rdi], xmm11; *sum_sxr + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm new file mode 100644 index 000000000..d3feb7ec0 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm @@ -0,0 +1,1489 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_8: times 8 dw 8 ; rounding term added before the >>4 of the bilinear filters +bilin_filter_m_sse2: times 8 dw 16 ; 8 phases, each two rows of 8 words (tap a, tap b), a + b = 16 + times 8 dw 0 + times 8 dw 14 + times 8 dw 2 + times 8 dw 12 + times 8 dw 4 + times 8 dw 10 + times 8 dw 6 + times 16 dw 8 ; phase 4 (half-pel): both taps are 8 + times 8 dw 6 + times 8 dw 10 + times 8 dw 4 + times 8 dw 12 + times 8 dw 2 + times 8 dw 14 + +bilin_filter_m_ssse3: times 8 db 16, 0 ; 8 phases, each 16 bytes of interleaved (tap a, tap b) pairs for pmaddubsw + times 8 db 14, 2 + times 8 db 12, 4 + times 8 db 10, 6 + times 16 db 8 ; phase 4 (half-pel): both taps are 8 + times 8 db 6, 10 + times 8 db 4, 12 + times 8 db 2, 14 + +SECTION .text + +; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, +; int x_offset, int y_offset, +; const uint8_t *dst, ptrdiff_t dst_stride, +; int height, unsigned int *sse); +; +; This function returns the SE and stores SSE in the given pointer. + +%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse - adds (src-dst) to the word sums in %5 and (src-dst)^2 to the dword sums in %6; clobbers %1 and %3 + psubw %3, %4 + psubw %1, %2 + paddw %5, %3 + pmaddwd %3, %3 + paddw %5, %1 + pmaddwd %1, %1 + paddd %6, %3 + paddd %6, %1 +%endmacro + +%macro STORE_AND_RET 1 +%if %1 > 4 + ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit + ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. + ; We have to sign-extend it before adding the words within the register + ; and outputting to a dword.
+ pcmpgtw m5, m6 ; mask for 0 > x + movhlps m3, m7 + punpcklwd m4, m6, m5 + punpckhwd m6, m5 ; sign-extend m6 word->dword + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + pshufd m4, m6, 0x1 + movd [r1], m7 ; store sse + paddd m6, m4 + movd raxd, m6 ; store sum as return value +%else ; 4xh + pshuflw m4, m6, 0xe + pshuflw m3, m7, 0xe + paddw m6, m4 + paddd m7, m3 + pcmpgtw m5, m6 ; mask for 0 > x + mov r1, ssem ; r1 = unsigned int *sse + punpcklwd m6, m5 ; sign-extend m6 word->dword + movd [r1], m7 ; store sse + pshuflw m4, m6, 0xe + paddd m6, m4 + movd raxd, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro INC_SRC_BY_SRC_STRIDE 0 +%if ARCH_X86=1 && CONFIG_PIC=1 + add srcq, src_stridemp +%else + add srcq, src_strideq +%endif +%endmacro + +%macro SUBPEL_VARIANCE 1-2 0 ; W [, avg (default 0)] +%if cpuflag(ssse3) ; ssse3 table phases are 16 bytes apart (shift 4), sse2 phases 32 bytes apart (shift 5) +%define bilin_filter_m bilin_filter_m_ssse3 +%define filter_idx_shift 4 +%else +%define bilin_filter_m bilin_filter_m_sse2 +%define filter_idx_shift 5 +%endif +; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses +; 11, not 13, if the registers are ordered correctly.
May make a minor speed +; difference on Win64 + +%ifdef PIC ; 64bit PIC: sseq is reused to hold the address of the filter table + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse + %define sec_str sec_strideq + %else + cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ + y_offset, dst, dst_stride, height, sse + %endif + %define block_height heightd + %define bilin_filter sseq +%else + %if ARCH_X86=1 && CONFIG_PIC=1 + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse, g_bilin_filter, g_pw_8 + %define block_height dword heightm + %define sec_str sec_stridemp + + ;Store the addresses of bilin_filter_m and pw_8 on the stack (g_bilin_filterm, g_pw_8m) + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ + y_offset, dst, dst_stride, height, sse, \ + g_bilin_filter, g_pw_8 + %define block_height heightd + + ;Store the addresses of bilin_filter_m and pw_8 on the stack (g_bilin_filterm, g_pw_8m) + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %endif + %else + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ + 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse + %if ARCH_X86_64 + %define block_height heightd + %define sec_str sec_strideq + %else + %define block_height dword heightm + %define sec_str sec_stridemp + %endif + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ + y_offset, dst, dst_stride,
height, sse + %define block_height heightd + %endif + + %define bilin_filter bilin_filter_m + %endif +%endif + +%if %1 == 4 + %define movx movd +%else + %define movx movh +%endif + + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we + ; could perhaps use it for something more productive then + pxor m5, m5 ; dedicated zero register +%if %1 < 16 + sar block_height, 1 ; blocks narrower than 16 handle two rows per loop iteration +%if %2 == 1 ; avg + shl sec_str, 1 ; so that each "add secq, sec_str" steps over two rows +%endif +%endif + + ; FIXME(rbultje) replace by jumptable? + test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + mova m1, [dstq] +%if %2 == 1 ; avg + pavgb m0, [secq] + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + +%if %2 == 0 ; !avg + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m0, [srcq+src_strideq] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 +%endif +%else ; !avg + movx m2, [srcq+src_strideq] +%endif + + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + +%if %2 == 1 ; avg +%if %1 > 4 + pavgb m0, [secq] +%else + movh m2, [secq] + pavgb m0, m2 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 +%if %1 > 4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_zero_loop + STORE_AND_RET %1 + +.x_zero_y_nonzero: + cmp y_offsetd, 4 ; offset 4 is the half-pel phase, handled with pavgb instead of the filter table + jne .x_zero_y_nonhalf + + ; x_offset ==
0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m2, [srcq+src_strideq] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m2, [srcq+src_strideq*2] +%else ; 4xh + movx m1, [srcq+src_strideq*2] + punpckldq m2, m1 +%endif + movx m1, [dstq] +%if %1 > 4 + movlhps m0, m2 +%else ; 4xh + punpckldq m0, m2 +%endif + movx m3, [dstq+dst_strideq] + pavgb m0, m2 + punpcklbw m1, m5 +%if %1 > 4 + pavgb m0, [secq] + punpcklbw m3, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m4, [secq] + pavgb m0, m4 + punpcklbw m3, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m4, [srcq+src_strideq*2] + movx m1, [dstq] + pavgb m0, m2 + movx m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_half_loop + STORE_AND_RET %1 + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl y_offsetd, filter_idx_shift ; y_offset becomes a byte offset into the filter table +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0, reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov
tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq*2] + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movx m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m4, filter_y_b + paddw m0, m1 + paddw m2, filter_rnd + movx m1, [dstq] + paddw m2, m4 +%endif +
psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonzero: + cmp x_offsetd, 4 ; offset 4 is the half-pel phase + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m4, [srcq+1] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m0, [srcq+src_strideq] + movhps m4, [srcq+src_strideq+1] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 + movx m2, [srcq+src_strideq+1] + punpckldq m4, m2 +%endif + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + pavgb m0, m4 + punpcklbw m3, m5 +%if %1 > 4 + pavgb m0, [secq] + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m2, [secq] + pavgb m0, m2 + punpcklbw m1, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m2, [srcq+src_strideq] + movx m1, [dstq] + pavgb m0, m4 + movx m4, [srcq+src_strideq+1] + movx m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg
+ add secq, sec_str +%endif + dec block_height + jg .x_half_y_zero_loop + STORE_AND_RET %1 + +.x_half_y_nonzero: + cmp y_offsetd, 4 ; offset 4 is the half-pel phase + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + pavgb m4, m3 + punpckhbw m3, m1, m5 + pavgb m0, m4 +%if %2 == 1 ; avg + punpcklbw m1, m5 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 ; keep this row's horizontal average as the "previous row" of the next iteration + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movx m2, [srcq] + movx m3, [srcq+1] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m2, [srcq+src_strideq] + movhps m3, [srcq+src_strideq+1] +%else + movx m1, [srcq+src_strideq] + punpckldq m2, m1 + movx m1, [srcq+src_strideq+1] + punpckldq m3, m1 +%endif + pavgb m2, m3 +%if %1 > 4 + movlhps m0, m2 + movhlps m4, m2 +%else ; 4xh + punpckldq m0, m2 + pshuflw m4, m2, 0xe +%endif + movx m1, [dstq] + pavgb m0, m2 + movx m3, [dstq+dst_strideq] +%if %1 > 4 + pavgb m0, [secq] +%else + movh m2, [secq] + pavgb m0, m2 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 +%if %1 > 4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m4, [srcq+src_strideq] + movx m1, [srcq+src_strideq+1] + pavgb m2, m3 + pavgb m4, m1 + pavgb m0, m2 + pavgb m2, m4 + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_half_loop + STORE_AND_RET %1 + +.x_half_y_nonhalf: + ;
x_offset == 0.5 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl y_offsetd, filter_idx_shift ; y_offset becomes a byte offset into the filter table +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ;x86_32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0.5. We can reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_other_loop: + movu m4, [srcq] + movu m2, [srcq+1] + mova m1, [dstq] + pavgb m4, m2 +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + punpcklbw m0, m5 + paddw m2, m3 + punpcklbw m3, m4, m5 + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 +%endif + punpckhbw m3, m1, m5 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +%if notcpuflag(ssse3) + punpcklbw m0, m5 +%endif +.x_half_y_other_loop: + movx m2, [srcq] + movx m1, [srcq+1] + movx m4,
[srcq+src_strideq] + movx m3, [srcq+src_strideq+1] + pavgb m2, m1 + pavgb m4, m3 + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movx m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + paddw m0, m1 + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m2, m1 + movx m1, [dstq] +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift ; x_offset becomes a byte offset into the filter table +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +;y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + pmullw m0, filter_x_a + pmullw m4, filter_x_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq+1] + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + punpcklbw m0, m1 + movx m1, [dstq] + punpcklbw m2, m4 + pmaddubsw m0, filter_x_a + pmaddubsw m2, filter_x_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m0, m1 + paddw m2, filter_rnd + movx m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2,
[secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 4 ; offset 4 is the half-pel phase + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift ; x_offset becomes a byte offset into the filter table +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0.5. We can reuse y_offset reg. +%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + add srcq, src_strideq + packuswb m0, m2 +.x_other_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] +%if cpuflag(ssse3) + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2,
filter_x_a + pmaddubsw m4, filter_x_a + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + pavgb m0, m4 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%else + punpckhbw m2, m4, m5 + punpckhbw m1, m3, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + paddw m4, m3 + paddw m2, m1 + mova m1, [dstq] + psraw m4, 4 + psraw m2, 4 + punpckhbw m3, m1, m5 + ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we + ; have a 1-register shortage to be able to store the backup of the bilin + ; filtered second line as words as cache for the next line. Packing into + ; a byte costs 1 pack and 2 unpacks, but saves a register. + packuswb m4, m2 + punpcklbw m1, m5 + pavgb m0, m4 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + pavgb m0, [secq] +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 ; keep this filtered row as the "previous row" of the next iteration + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + add srcq, src_strideq + psraw m0, 4 +.x_other_y_half_loop: + movx m2, [srcq] + movx m1, [srcq+1] + movx m4, [srcq+src_strideq] + movx m3, [srcq+src_strideq+1] +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + paddw m2, filter_rnd + paddw m4, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + movx m1, [dstq] + paddw m4,
m3 + movx m3, [dstq+dst_strideq] +%endif + psraw m2, 4 + psraw m4, 4 + pavgw m0, m2 ; vertical half-pel: average each horizontally filtered row with the one below it + pavgw m2, m4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline - also consider going to bytes here +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf_y_nonhalf: +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m11, [bilin_filter+y_offsetq+16] +%endif + mova m12, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; In this case, there is NO unused register, so the src_stride register is used as a temporary. Later, +; src_stride has to be loaded from stack when it is needed.
+%define tempq src_strideq + mov tempq, g_bilin_filterm + add x_offsetq, tempq + add y_offsetq, tempq +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + + INC_SRC_BY_SRC_STRIDE + + packuswb m0, m2 +.x_other_y_other_loop: +%if cpuflag(ssse3) + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + punpckhbw m3, m1, m5 + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + punpcklbw m1, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 + psraw m0, 4 +%else + movu m3, [srcq] + movu m4, [srcq+1] + punpckhbw m1, m3, m5 + punpckhbw m2, m4, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 + pmullw m3, filter_x_a + pmullw m4, filter_x_b + paddw m3, filter_rnd + pmullw m1, filter_x_a + pmullw m2, filter_x_b + paddw m1, filter_rnd + paddw m3, m4 + paddw m1, m2 + psraw m3, 4 + psraw m1, 4 +
packuswb m4, m3, m1 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + pmullw m2, filter_y_a + pmullw m1, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m2, m1 + mova m1, [dstq] + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 + punpckhbw m3, m1, m5 + psraw m0, 4 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + INC_SRC_BY_SRC_STRIDE + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + psraw m0, 4 +%if cpuflag(ssse3) + packuswb m0, m0 +%endif + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movx m2, [srcq] + movx m1, [srcq+1] + + INC_SRC_BY_SRC_STRIDE + movx m4, [srcq] + movx m3, [srcq+1] + +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movx m3, [dstq+dst_strideq] + movx m1, [dstq] + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m2, m2 + packuswb m4, m4 + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd + psraw m0, 4 + psraw m2, 4 + punpcklbw m1, m5 +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + paddw m4, m3 + psraw m2, 4 + psraw m4, 4 + pmullw m0, filter_y_a + pmullw m3, m2, filter_y_b + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m0, m3 + movx m3,
[dstq+dst_strideq] + paddw m2, m1 + movx m1, [dstq] + psraw m0, 4 + psraw m2, 4 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd +%undef movx + STORE_AND_RET %1 +%endmacro + +; FIXME(rbultje) the non-bilinear versions (i.e. x=0,4&&y=0,4) are identical +; between the ssse3 and non-ssse3 version. It may make sense to merge their +; code in the sense that the ssse3 version would jump to the appropriate +; location in the sse2 version, rather than duplicating that code in the +; binary. + +INIT_XMM sse2 +SUBPEL_VARIANCE 4 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM ssse3 +SUBPEL_VARIANCE 4 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM sse2 +SUBPEL_VARIANCE 4, 1 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 + +INIT_XMM ssse3 +SUBPEL_VARIANCE 4, 1 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff --git a/third_party/aom/aom_dsp/x86/subtract_sse2.asm b/third_party/aom/aom_dsp/x86/subtract_sse2.asm new file mode 100644 index 000000000..7bd5b23ad --- /dev/null +++ b/third_party/aom/aom_dsp/x86/subtract_sse2.asm @@ -0,0 +1,150 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software.
; If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; void aom_subtract_block(int rows, int cols,
;                         int16_t *diff, ptrdiff_t diff_stride,
;                         const uint8_t *src, ptrdiff_t src_stride,
;                         const uint8_t *pred, ptrdiff_t pred_stride)
;
; Writes diff[r][c] = src[r][c] - pred[r][c] as 16-bit values. Bytes are
; widened by interleaving with the zero register m7 and subtracted with psubw.
;
; The code dispatches on cols (4, 8, 16, 32, plus 64 when CONFIG_EXT_PARTITION
; is set). Any other width falls through to the widest loop that follows the
; dispatch: .loop_128 with CONFIG_EXT_PARTITION, .loop_64 otherwise.
;
; diff_stride is counted in int16_t elements, hence the *2 / *4 scaling when
; diffq is advanced. All 16-byte accesses use mova, i.e. they are aligned.

INIT_XMM sse2
cglobal subtract_block, 7, 7, 8, \
                        rows, cols, diff, diff_stride, src, src_stride, \
                        pred, pred_stride
; pred_stride is the eighth argument and is read from pred_stridemp. Once the
; width dispatch is done cols is dead, so its register is reused for it.
%define pred_str colsq
  pxor                  m7, m7         ; dedicated zero register
  cmp                colsd, 4
  je .case_4
  cmp                colsd, 8
  je .case_8
  cmp                colsd, 16
  je .case_16
  cmp                colsd, 32
  je .case_32
%if CONFIG_EXT_PARTITION
  cmp                colsd, 64
  je .case_64
%endif

; loop16 src_off0, src_off1, pred_off0, pred_off1, diff_off0, diff_off1
; Subtracts two independent 16-pixel groups. Group 0 reads src+%1 / pred+%3 and
; writes 16 words (2*mmsize bytes) at diff+%5; group 1 uses %2 / %4 / %6.
%macro loop16 6
  mova                  m0, [srcq+%1]
  mova                  m4, [srcq+%2]
  mova                  m1, [predq+%3]
  mova                  m5, [predq+%4]
  punpckhbw             m2, m0, m7
  punpckhbw             m3, m1, m7
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  psubw                 m2, m3
  psubw                 m0, m1
  punpckhbw             m1, m4, m7
  punpckhbw             m3, m5, m7
  punpcklbw             m4, m7
  punpcklbw             m5, m7
  psubw                 m1, m3
  psubw                 m4, m5
  mova [diffq+mmsize*0+%5], m0
  mova [diffq+mmsize*1+%5], m2
  mova [diffq+mmsize*0+%6], m4
  mova [diffq+mmsize*1+%6], m1
%endmacro

%if CONFIG_EXT_PARTITION
  ; 128 pixels per row: four loop16 invocations, one row per iteration.
  mov             pred_str, pred_stridemp
.loop_128:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize,  0*mmsize,  2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize,  4*mmsize,  6*mmsize
  loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize,  8*mmsize, 10*mmsize
  loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  sub                rowsd, 1
  jnz .loop_128
  RET

.case_64:
%endif
  ; 64 pixels per row: two loop16 invocations, one row per iteration.
  mov             pred_str, pred_stridemp
.loop_64:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_64
  RET

.case_32:
  ; 32 pixels per row: one loop16 invocation, one row per iteration.
  mov             pred_str, pred_stridemp
.loop_32:
  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_32
  RET

.case_16:
  ; 16 pixels per row: loop16's second group is the next row (offsets are the
  ; strides), so two rows are consumed per iteration.
  mov             pred_str, pred_stridemp
.loop_16:
  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
  lea                diffq, [diffq+diff_strideq*4]
  lea                predq, [predq+pred_str*2]
  lea                 srcq, [srcq+src_strideq*2]
  sub                rowsd, 2
  jg .loop_16
  RET

; loop_h: two rows of 8 pixels each. movh loads the low half of a register
; (8 bytes with XMM registers); each 8-word result is stored with mova.
%macro loop_h 0
  movh                  m0, [srcq]
  movh                  m2, [srcq+src_strideq]
  movh                  m1, [predq]
  movh                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  mova             [diffq], m0
  mova [diffq+diff_strideq*2], m2
%endmacro

.case_8:
  mov             pred_str, pred_stridemp
.loop_8:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_8
  RET

; 4 pixels per row, two rows per iteration.
; This path used to switch to MMX registers (INIT_MMX) and reuse loop_h. After
; that switch m7 names mm7, which this function never zeroes (the pxor above
; cleared xmm7), and the function returned with MMX state live and no emms.
; The path now stays on XMM registers: movd loads exactly 4 bytes and clears
; the rest of the register, and movh stores exactly the 4 result words.
.case_4:
  mov             pred_str, pred_stridemp
.loop_4:
  movd                  m0, [srcq]
  movd                  m2, [srcq+src_strideq]
  movd                  m1, [predq]
  movd                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  movh             [diffq], m0
  movh [diffq+diff_strideq*2], m2
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_4
  RET
+ */ + +#include +#include +#include + +#include "aom_dsp/x86/synonyms.h" + +#include "./aom_dsp_rtcd.h" + +static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, + int stride) { + const __m128i v_val_0_w = + _mm_loadl_epi64((const __m128i *)(src + 0 * stride)); + const __m128i v_val_1_w = + _mm_loadl_epi64((const __m128i *)(src + 1 * stride)); + const __m128i v_val_2_w = + _mm_loadl_epi64((const __m128i *)(src + 2 * stride)); + const __m128i v_val_3_w = + _mm_loadl_epi64((const __m128i *)(src + 3 * stride)); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + + const __m128i v_sum_d = + _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32)); + + return (uint64_t)_mm_cvtsi128_si32(v_sum_d); +} + +#ifdef __GNUC__ +// This prevents GCC/Clang from inlining this function into +// aom_sum_squares_2d_i16_sse2, which in turn saves some stack +// maintenance instructions in the common case of 4x4. 
+__attribute__((noinline)) +#endif +static uint64_t +aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width, + int height) { + int r, c; + + const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + __m128i v_acc_q = _mm_setzero_si128(); + + for (r = 0; r < height; r += 8) { + __m128i v_acc_d = _mm_setzero_si128(); + + for (c = 0; c < width; c += 8) { + const int16_t *b = src + c; + + const __m128i v_val_0_w = + _mm_load_si128((const __m128i *)(b + 0 * stride)); + const __m128i v_val_1_w = + _mm_load_si128((const __m128i *)(b + 1 * stride)); + const __m128i v_val_2_w = + _mm_load_si128((const __m128i *)(b + 2 * stride)); + const __m128i v_val_3_w = + _mm_load_si128((const __m128i *)(b + 3 * stride)); + const __m128i v_val_4_w = + _mm_load_si128((const __m128i *)(b + 4 * stride)); + const __m128i v_val_5_w = + _mm_load_si128((const __m128i *)(b + 5 * stride)); + const __m128i v_val_6_w = + _mm_load_si128((const __m128i *)(b + 6 * stride)); + const __m128i v_val_7_w = + _mm_load_si128((const __m128i *)(b + 7 * stride)); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); + const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); + const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); + const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); + const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); + + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); + + 
v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d); + } + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); + + src += 8 * stride; + } + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); + +#if ARCH_X86_64 + return (uint64_t)_mm_cvtsi128_si64(v_acc_q); +#else + { + uint64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, v_acc_q); + return tmp; + } +#endif +} + +uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width, + int height) { + // 4 elements per row only requires half an XMM register, so this + // must be a special case, but also note that over 75% of all calls + // are with size == 4, so it is also the common case. + if (LIKELY(width == 4 && height == 4)) { + return aom_sum_squares_2d_i16_4x4_sse2(src, stride); + } else if (LIKELY(width % 8 == 0 && height % 8 == 0)) { + // Generic case + return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height); + } else { + return aom_sum_squares_2d_i16_c(src, stride, width, height); + } +} + +////////////////////////////////////////////////////////////////////////////// +// 1D version +////////////////////////////////////////////////////////////////////////////// + +static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) { + const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + __m128i v_acc0_q = _mm_setzero_si128(); + __m128i v_acc1_q = _mm_setzero_si128(); + + const int16_t *const end = src + n; + + assert(n % 64 == 0); + + while (src < end) { + const __m128i v_val_0_w = xx_load_128(src); + const __m128i v_val_1_w = xx_load_128(src + 8); + const __m128i v_val_2_w = xx_load_128(src + 16); + const __m128i v_val_3_w = xx_load_128(src + 24); + const __m128i v_val_4_w = xx_load_128(src + 32); + const __m128i v_val_5_w = xx_load_128(src + 40); + const __m128i v_val_6_w = xx_load_128(src + 48); + const __m128i 
v_val_7_w = xx_load_128(src + 56); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); + const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); + const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); + const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); + const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); + + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); + + const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d); + + v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q)); + v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32)); + + src += 64; + } + + v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q); + v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8)); + +#if ARCH_X86_64 + return (uint64_t)_mm_cvtsi128_si64(v_acc0_q); +#else + { + uint64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, v_acc0_q); + return tmp; + } +#endif +} + +uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) { + if (n % 64 == 0) { + return aom_sum_squares_i16_64n_sse2(src, n); + } else if (n > 64) { + int k = n & ~(64 - 1); + return aom_sum_squares_i16_64n_sse2(src, k) + + aom_sum_squares_i16_c(src + k, n - k); + } else { + return aom_sum_squares_i16_c(src, n); + } +} diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h new file mode 100644 index 000000000..bef606dae --- /dev/null +++ 
b/third_party/aom/aom_dsp/x86/synonyms.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_SYNONYMS_H_ +#define AOM_DSP_X86_SYNONYMS_H_ + +#include + +#include "./aom_config.h" +#include "aom/aom_integer.h" + +/** + * Various reusable shorthands for x86 SIMD intrinsics. + * + * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers. + * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers. + */ + +// Loads and stores to do away with the tedium of casting the address +// to the right type. 
+static INLINE __m128i xx_loadl_32(const void *a) { + return _mm_cvtsi32_si128(*(const uint32_t *)a); +} + +static INLINE __m128i xx_loadl_64(const void *a) { + return _mm_loadl_epi64((const __m128i *)a); +} + +static INLINE __m128i xx_load_128(const void *a) { + return _mm_load_si128((const __m128i *)a); +} + +static INLINE __m128i xx_loadu_128(const void *a) { + return _mm_loadu_si128((const __m128i *)a); +} + +static INLINE void xx_storel_32(void *const a, const __m128i v) { + *(uint32_t *)a = _mm_cvtsi128_si32(v); +} + +static INLINE void xx_storel_64(void *const a, const __m128i v) { + _mm_storel_epi64((__m128i *)a, v); +} + +static INLINE void xx_store_128(void *const a, const __m128i v) { + _mm_store_si128((__m128i *)a, v); +} + +static INLINE void xx_storeu_128(void *const a, const __m128i v) { + _mm_storeu_si128((__m128i *)a, v); +} + +static INLINE __m128i xx_round_epu16(__m128i v_val_w) { + return _mm_avg_epu16(v_val_w, _mm_setzero_si128()); +} + +static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) { + const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1); + return _mm_avg_epu16(v_s_w, _mm_setzero_si128()); +} + +static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); + const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); + return _mm_srli_epi32(v_tmp_d, bits); +} + +// This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits) +static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); + const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); + return _mm_srai_epi32(v_tmp_d, bits); +} + +// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits) +static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); + const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31); + const __m128i v_tmp_d = + 
_mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d); + return _mm_srai_epi32(v_tmp_d, bits); +} + +#ifdef __SSSE3__ +static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) { + v_d = _mm_hadd_epi32(v_d, v_d); + v_d = _mm_hadd_epi32(v_d, v_d); + return _mm_cvtsi128_si32(v_d); +} + +static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) { + v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8)); +#if ARCH_X86_64 + return _mm_cvtsi128_si64(v_q); +#else + { + int64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, v_q); + return tmp; + } +#endif +} + +static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) { + const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); + const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d); + const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d); + return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q)); +} +#endif // __SSSE3__ + +#endif // AOM_DSP_X86_SYNONYMS_H_ diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h new file mode 100644 index 000000000..39e9b8e2a --- /dev/null +++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H +#define AOM_DSP_X86_TXFM_COMMON_AVX2_H + +#include + +#include "aom_dsp/txfm_common.h" + +#define pair256_set_epi16(a, b) \ + _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +#define pair256_set_epi32(a, b) \ + _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \ + (int)(b), (int)(a)) + +static INLINE void mm256_reverse_epi16(__m256i *u) { + const __m256i control = _mm256_set_epi16( + 0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100, + 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E); + __m256i v = _mm256_shuffle_epi8(*u, control); + *u = _mm256_permute2x128_si256(v, v, 1); +} + +static INLINE void mm256_transpose_16x16(__m256i *in) { + __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]); + __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]); + __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]); + __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]); + __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]); + __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]); + __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]); + __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]); + + __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]); + __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]); + __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]); + __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]); + __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]); + __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]); + __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]); + __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]); + + // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b + // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f + // 20 30 21 31 22 32 23 33 28 38 29 39 2a 
3a 2b 3b + // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f + // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b + // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f + // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b + // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f + + // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b + // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f + // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb + // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf + // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db + // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df + // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb + // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff + + __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2); + __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2); + __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3); + __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3); + __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6); + __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6); + __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7); + __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7); + + __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a); + __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a); + __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b); + __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b); + __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e); + __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e); + __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f); + __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f); + + // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39 + // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b + // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d + // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f + // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79 + // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b + // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d + // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 
4f 5f 6f 7f + + // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9 + // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb + // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd + // 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf + // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9 + // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb + // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd + // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff + + tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); + tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); + tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5); + tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5); + tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6); + tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6); + tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); + tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); + + tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c); + tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c); + tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d); + tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d); + tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e); + tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e); + tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f); + tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f); + + // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78 + // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79 + // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a + // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b + // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c + // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d + // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e + // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f + + // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8 + // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9 + // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa + // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb + // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc + // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd + // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe + // 
87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff + + in[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000 + in[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001 + in[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20); + in[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31); + in[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20); + in[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31); + in[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20); + in[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31); + + in[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20); + in[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31); + in[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20); + in[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31); + in[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20); + in[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31); + in[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20); + in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31); +} + +static INLINE __m256i butter_fly(__m256i a0, __m256i a1, const __m256i cospi) { + const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); + __m256i y0 = _mm256_madd_epi16(a0, cospi); + __m256i y1 = _mm256_madd_epi16(a1, cospi); + + y0 = _mm256_add_epi32(y0, dct_rounding); + y1 = _mm256_add_epi32(y1, dct_rounding); + y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS); + y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS); + + return _mm256_packs_epi32(y0, y1); +} + +static INLINE void txfm_scaling16_avx2(const int16_t c, __m256i *in) { + const __m256i zero = _mm256_setzero_si256(); + const __m256i sqrt2_epi16 = _mm256_set1_epi16(c); + const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); + __m256i u0, u1; + int i = 0; + + while (i < 16) { + in[i] = _mm256_slli_epi16(in[i], 1); + + u0 = _mm256_unpacklo_epi16(zero, in[i]); + u1 = _mm256_unpackhi_epi16(zero, in[i]); + + u0 = _mm256_madd_epi16(u0, sqrt2_epi16); + u1 = _mm256_madd_epi16(u1, sqrt2_epi16); + + u0 = 
_mm256_add_epi32(u0, dct_const_rounding); + u1 = _mm256_add_epi32(u1, dct_const_rounding); + + u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS); + u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS); + in[i] = _mm256_packs_epi32(u0, u1); + i++; + } +} + +#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H diff --git a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h b/third_party/aom/aom_dsp/x86/txfm_common_intrin.h new file mode 100644 index 000000000..e4ac56339 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/txfm_common_intrin.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ +#define _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ + +// Note: +// This header file should be put below any x86 intrinsics head file + +static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { +#if CONFIG_HIGHBITDEPTH + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_storeu_si128((__m128i *)(dst_ptr), out0); + _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); +#else + _mm_storeu_si128((__m128i *)(dst_ptr), *poutput); +#endif // CONFIG_HIGHBITDEPTH +} + +#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h new file mode 100644 index 000000000..4257d8b9c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_X86_TXFM_COMMON_SSE2_H_ +#define AOM_DSP_X86_TXFM_COMMON_SSE2_H_ + +#include +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +#define pair_set_epi16(a, b) \ + _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +#define dual_set_epi16(a, b) \ + _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \ + (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a)) + +#define octa_set_epi16(a, b, c, d, e, f, g, h) \ + _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ + (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) + +// Reverse the 8 16 bit words in __m128i +static INLINE __m128i mm_reverse_epi16(const __m128i x) { + const __m128i a = _mm_shufflelo_epi16(x, 0x1b); + const __m128i b = _mm_shufflehi_epi16(a, 0x1b); + return _mm_shuffle_epi32(b, 0x4e); +} + +#if CONFIG_EXT_TX +// Identity transform (both forward and inverse). +static INLINE void idtx16_8col(__m128i *in) { + const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0); + const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i y0, y1, y2, y3, y4, y5, y6, y7; + + in[0] = _mm_slli_epi16(in[0], 1); + in[1] = _mm_slli_epi16(in[1], 1); + in[2] = _mm_slli_epi16(in[2], 1); + in[3] = _mm_slli_epi16(in[3], 1); + in[4] = _mm_slli_epi16(in[4], 1); + in[5] = _mm_slli_epi16(in[5], 1); + in[6] = _mm_slli_epi16(in[6], 1); + in[7] = _mm_slli_epi16(in[7], 1); + in[8] = _mm_slli_epi16(in[8], 1); + in[9] = _mm_slli_epi16(in[9], 1); + in[10] = _mm_slli_epi16(in[10], 1); + in[11] = _mm_slli_epi16(in[11], 1); + in[12] = _mm_slli_epi16(in[12], 1); + in[13] = _mm_slli_epi16(in[13], 1); + in[14] = _mm_slli_epi16(in[14], 1); + in[15] = 
_mm_slli_epi16(in[15], 1); + + v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16); + v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16); + v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16); + v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16); + v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16); + v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16); + v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16); + v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16); + + u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16); + u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16); + u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16); + u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16); + u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16); + u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16); + u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16); + u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16); + + x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16); + x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16); + x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16); + x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16); + x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16); + x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16); + x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16); + x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16); + + y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16); + y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16); + y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16); + y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16); + y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16); + y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16); + y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16); + y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16); + + v0 = _mm_madd_epi16(v0, k__sqrt2_epi16); + v1 = _mm_madd_epi16(v1, k__sqrt2_epi16); + v2 = _mm_madd_epi16(v2, k__sqrt2_epi16); + v3 = _mm_madd_epi16(v3, k__sqrt2_epi16); + v4 = _mm_madd_epi16(v4, k__sqrt2_epi16); + v5 = _mm_madd_epi16(v5, k__sqrt2_epi16); + v6 = _mm_madd_epi16(v6, k__sqrt2_epi16); + v7 = _mm_madd_epi16(v7, k__sqrt2_epi16); + + x0 = _mm_madd_epi16(x0, 
k__sqrt2_epi16); + x1 = _mm_madd_epi16(x1, k__sqrt2_epi16); + x2 = _mm_madd_epi16(x2, k__sqrt2_epi16); + x3 = _mm_madd_epi16(x3, k__sqrt2_epi16); + x4 = _mm_madd_epi16(x4, k__sqrt2_epi16); + x5 = _mm_madd_epi16(x5, k__sqrt2_epi16); + x6 = _mm_madd_epi16(x6, k__sqrt2_epi16); + x7 = _mm_madd_epi16(x7, k__sqrt2_epi16); + + u0 = _mm_madd_epi16(u0, k__sqrt2_epi16); + u1 = _mm_madd_epi16(u1, k__sqrt2_epi16); + u2 = _mm_madd_epi16(u2, k__sqrt2_epi16); + u3 = _mm_madd_epi16(u3, k__sqrt2_epi16); + u4 = _mm_madd_epi16(u4, k__sqrt2_epi16); + u5 = _mm_madd_epi16(u5, k__sqrt2_epi16); + u6 = _mm_madd_epi16(u6, k__sqrt2_epi16); + u7 = _mm_madd_epi16(u7, k__sqrt2_epi16); + + y0 = _mm_madd_epi16(y0, k__sqrt2_epi16); + y1 = _mm_madd_epi16(y1, k__sqrt2_epi16); + y2 = _mm_madd_epi16(y2, k__sqrt2_epi16); + y3 = _mm_madd_epi16(y3, k__sqrt2_epi16); + y4 = _mm_madd_epi16(y4, k__sqrt2_epi16); + y5 = _mm_madd_epi16(y5, k__sqrt2_epi16); + y6 = _mm_madd_epi16(y6, k__sqrt2_epi16); + y7 = _mm_madd_epi16(y7, k__sqrt2_epi16); + + v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING); + x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING); + x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING); + x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING); + x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING); + x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING); + x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING); + x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING); + + u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + 
u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + + y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING); + y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING); + y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING); + y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING); + y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING); + y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING); + y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING); + y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + x0 = _mm_srai_epi32(x0, DCT_CONST_BITS); + x1 = _mm_srai_epi32(x1, DCT_CONST_BITS); + x2 = _mm_srai_epi32(x2, DCT_CONST_BITS); + x3 = _mm_srai_epi32(x3, DCT_CONST_BITS); + x4 = _mm_srai_epi32(x4, DCT_CONST_BITS); + x5 = _mm_srai_epi32(x5, DCT_CONST_BITS); + x6 = _mm_srai_epi32(x6, DCT_CONST_BITS); + x7 = _mm_srai_epi32(x7, DCT_CONST_BITS); + + u0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + y0 = _mm_srai_epi32(y0, DCT_CONST_BITS); + y1 = _mm_srai_epi32(y1, DCT_CONST_BITS); + y2 = _mm_srai_epi32(y2, DCT_CONST_BITS); + y3 = _mm_srai_epi32(y3, DCT_CONST_BITS); + y4 = _mm_srai_epi32(y4, DCT_CONST_BITS); + y5 = _mm_srai_epi32(y5, DCT_CONST_BITS); + y6 = _mm_srai_epi32(y6, DCT_CONST_BITS); + y7 = _mm_srai_epi32(y7, DCT_CONST_BITS); + + in[0] = 
_mm_packs_epi32(v0, x0); + in[1] = _mm_packs_epi32(v1, x1); + in[2] = _mm_packs_epi32(v2, x2); + in[3] = _mm_packs_epi32(v3, x3); + in[4] = _mm_packs_epi32(v4, x4); + in[5] = _mm_packs_epi32(v5, x5); + in[6] = _mm_packs_epi32(v6, x6); + in[7] = _mm_packs_epi32(v7, x7); + + in[8] = _mm_packs_epi32(u0, y0); + in[9] = _mm_packs_epi32(u1, y1); + in[10] = _mm_packs_epi32(u2, y2); + in[11] = _mm_packs_epi32(u3, y3); + in[12] = _mm_packs_epi32(u4, y4); + in[13] = _mm_packs_epi32(u5, y5); + in[14] = _mm_packs_epi32(u6, y6); + in[15] = _mm_packs_epi32(u7, y7); +} +#endif // CONFIG_EXT_TX + +static INLINE void scale_sqrt2_8x4(__m128i *in) { + // Implements ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS), for 32 + // consecutive elements. + const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2); + + const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w); + const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w); + const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w); + const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w); + const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w); + const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w); + const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w); + const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w); + + const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w); + const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w); + const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w); + const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w); + const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w); + const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w); + const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w); + const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w); + + in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS)); + in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, 
DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS)); + in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS)); + in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS)); +} + +static INLINE void scale_sqrt2_8x8(__m128i *in) { + // Implements 'ROUND_POWER_OF_TWO_SIGNED(input * Sqrt2, DCT_CONST_BITS)' + // for each element. + const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2); + + const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w); + const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w); + const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w); + const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w); + const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w); + const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w); + const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w); + const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w); + const __m128i v_p4l_w = _mm_mullo_epi16(in[4], v_scale_w); + const __m128i v_p4h_w = _mm_mulhi_epi16(in[4], v_scale_w); + const __m128i v_p5l_w = _mm_mullo_epi16(in[5], v_scale_w); + const __m128i v_p5h_w = _mm_mulhi_epi16(in[5], v_scale_w); + const __m128i v_p6l_w = _mm_mullo_epi16(in[6], v_scale_w); + const __m128i v_p6h_w = _mm_mulhi_epi16(in[6], v_scale_w); + const __m128i v_p7l_w = _mm_mullo_epi16(in[7], v_scale_w); + const __m128i v_p7h_w = _mm_mulhi_epi16(in[7], v_scale_w); + + const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w); + const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w); + const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w); + const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w); + const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w); + const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w); + const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w); + const __m128i v_p3b_d 
= _mm_unpackhi_epi16(v_p3l_w, v_p3h_w); + const __m128i v_p4a_d = _mm_unpacklo_epi16(v_p4l_w, v_p4h_w); + const __m128i v_p4b_d = _mm_unpackhi_epi16(v_p4l_w, v_p4h_w); + const __m128i v_p5a_d = _mm_unpacklo_epi16(v_p5l_w, v_p5h_w); + const __m128i v_p5b_d = _mm_unpackhi_epi16(v_p5l_w, v_p5h_w); + const __m128i v_p6a_d = _mm_unpacklo_epi16(v_p6l_w, v_p6h_w); + const __m128i v_p6b_d = _mm_unpackhi_epi16(v_p6l_w, v_p6h_w); + const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w); + const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w); + + in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS)); + in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS)); + in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS)); + in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS)); + in[4] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p4a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p4b_d, DCT_CONST_BITS)); + in[5] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p5a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p5b_d, DCT_CONST_BITS)); + in[6] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p6a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p6b_d, DCT_CONST_BITS)); + in[7] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p7a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p7b_d, DCT_CONST_BITS)); +} + +static INLINE void scale_sqrt2_8x16(__m128i *in) { + scale_sqrt2_8x8(in); + scale_sqrt2_8x8(in + 8); +} + +#endif // AOM_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c new file mode 100644 index 000000000..18a70dffe --- /dev/null +++ b/third_party/aom/aom_dsp/x86/variance_avx2.c @@ -0,0 +1,192 @@ +/* + * 
Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include "./aom_dsp_rtcd.h" + +typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +void aom_get32x32var_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse, + int *sum); + +static void variance_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int w, int h, + unsigned int *sse, int *sum, get_var_avx2 var_fn, + int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += 16) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j], + ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + +unsigned int aom_variance16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + unsigned int variance; + variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + aom_get16x16var_avx2, 16); + + variance = *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); + _mm256_zeroupper(); + return variance; +} + +unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + aom_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum); + _mm256_zeroupper(); + return *sse; +} + +unsigned int aom_variance32x16_avx2(const
uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + unsigned int variance; + variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, + aom_get32x32var_avx2, 32); + + variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 9); + _mm256_zeroupper(); + return variance; +} + +unsigned int aom_variance32x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + unsigned int variance; + variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, + aom_get32x32var_avx2, 32); + + variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 10); + _mm256_zeroupper(); + return variance; +} + +unsigned int aom_variance64x64_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + unsigned int variance; + variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, + aom_get32x32var_avx2, 32); + + variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 12); + _mm256_zeroupper(); + return variance; +} + +unsigned int aom_variance64x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + unsigned int variance; + variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, + aom_get32x32var_avx2, 32); + + variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 11); + _mm256_zeroupper(); + return variance; +} + +unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse); + +unsigned int aom_sub_pixel_avg_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, + int height, unsigned int *sseptr); + +unsigned int aom_sub_pixel_variance64x64_avx2(const uint8_t *src, + int src_stride, int x_offset, + int y_offset, const 
uint8_t *dst, + int dst_stride, + unsigned int *sse) { + unsigned int sse1; + const int se1 = aom_sub_pixel_variance32xh_avx2( + src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1); + unsigned int sse2; + const int se2 = + aom_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, + dst + 32, dst_stride, 64, &sse2); + const int se = se1 + se2; + unsigned int variance; + *sse = sse1 + sse2; + + variance = *sse - (uint32_t)(((int64_t)se * se) >> 12); + _mm256_zeroupper(); + return variance; +} + +unsigned int aom_sub_pixel_variance32x32_avx2(const uint8_t *src, + int src_stride, int x_offset, + int y_offset, const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + const int se = aom_sub_pixel_variance32xh_avx2( + src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse); + + const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10); + _mm256_zeroupper(); + return variance; +} + +unsigned int aom_sub_pixel_avg_variance64x64_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { + unsigned int sse1; + const int se1 = aom_sub_pixel_avg_variance32xh_avx2( + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1); + unsigned int sse2; + const int se2 = aom_sub_pixel_avg_variance32xh_avx2( + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32, + 64, 64, &sse2); + const int se = se1 + se2; + unsigned int variance; + + *sse = sse1 + sse2; + + variance = *sse - (uint32_t)(((int64_t)se * se) >> 12); + _mm256_zeroupper(); + return variance; +} + +unsigned int aom_sub_pixel_avg_variance32x32_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { + // Process 32 elements in parallel. 
+ const int se = aom_sub_pixel_avg_variance32xh_avx2( + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse); + + const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10); + _mm256_zeroupper(); + return variance; +} diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c new file mode 100644 index 000000000..999b541e3 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c @@ -0,0 +1,713 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> // AVX2 + +#include "./aom_dsp_rtcd.h" +#include "aom_ports/mem.h" + +/* clang-format off */ +DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, +}; +/* clang-format on */ + +void aom_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *SSE, int *Sum) { + __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; + __m256i ref_expand_high, madd_low, madd_high; + unsigned int i, src_2strides, ref_2strides; + __m256i zero_reg = _mm256_set1_epi16(0); + __m256i sum_ref_src = _mm256_set1_epi16(0); + __m256i madd_ref_src = _mm256_set1_epi16(0); + + // processing two strides in a 256 bit register reducing the number + // of loop stride by half (comparing to the sse2 code) + src_2strides = source_stride << 1; + ref_2strides = recon_stride << 1; + for (i = 0; i < 8; i++) { + src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr))); + src = _mm256_inserti128_si256( + src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1); + +
ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr))); + ref = _mm256_inserti128_si256( + ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1); + + // expanding to 16 bit each lane + src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); + src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); + + ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); + ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); + + // src-ref + src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); + src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); + + // madd low (src - ref) + madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); + + // add high to low + src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); + + // madd high (src - ref) + madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); + + sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); + + // add high to low + madd_ref_src = + _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); + + src_ptr += src_2strides; + ref_ptr += ref_2strides; + } + + { + __m128i sum_res, madd_res; + __m128i expand_sum_low, expand_sum_high, expand_sum; + __m128i expand_madd_low, expand_madd_high, expand_madd; + __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; + + // extract the low lane and add it to the high lane + sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src), + _mm256_extractf128_si256(sum_ref_src, 1)); + + madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src), + _mm256_extractf128_si256(madd_ref_src, 1)); + + // padding each 2 bytes with another 2 zeroed bytes + expand_sum_low = + _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res); + expand_sum_high = + _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res); + + // shifting the sign 16 bits right + expand_sum_low = _mm_srai_epi32(expand_sum_low, 16); + expand_sum_high = _mm_srai_epi32(expand_sum_high, 16); + + expand_sum 
= _mm_add_epi32(expand_sum_low, expand_sum_high); + + // expand each 32 bits of the madd result to 64 bits + expand_madd_low = + _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); + expand_madd_high = + _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); + + expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high); + + ex_expand_sum_low = + _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); + ex_expand_sum_high = + _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); + + ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high); + + // shift 8 bytes right + madd_res = _mm_srli_si128(expand_madd, 8); + sum_res = _mm_srli_si128(ex_expand_sum, 8); + + madd_res = _mm_add_epi32(madd_res, expand_madd); + sum_res = _mm_add_epi32(sum_res, ex_expand_sum); + + *((int *)SSE) = _mm_cvtsi128_si32(madd_res); + + *((int *)Sum) = _mm_cvtsi128_si32(sum_res); + } + _mm256_zeroupper(); +} + +void aom_get32x32var_avx2(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *SSE, int *Sum) { + __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; + __m256i ref_expand_high, madd_low, madd_high; + unsigned int i; + __m256i zero_reg = _mm256_set1_epi16(0); + __m256i sum_ref_src = _mm256_set1_epi16(0); + __m256i madd_ref_src = _mm256_set1_epi16(0); + + // processing 32 elements in parallel + for (i = 0; i < 16; i++) { + src = _mm256_loadu_si256((__m256i const *)(src_ptr)); + + ref = _mm256_loadu_si256((__m256i const *)(ref_ptr)); + + // expanding to 16 bit each lane + src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); + src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); + + ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); + ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); + + // src-ref + src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); + src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); + + //
madd low (src - ref) + madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); + + // add high to low + src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); + + // madd high (src - ref) + madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); + + sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); + + // add high to low + madd_ref_src = + _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); + + src_ptr += source_stride; + ref_ptr += recon_stride; + } + + { + __m256i expand_sum_low, expand_sum_high, expand_sum; + __m256i expand_madd_low, expand_madd_high, expand_madd; + __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; + + // padding each 2 bytes with another 2 zeroed bytes + expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src); + expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src); + + // shifting the sign 16 bits right + expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16); + expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16); + + expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high); + + // expand each 32 bits of the madd result to 64 bits + expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg); + expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg); + + expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high); + + ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg); + ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg); + + ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high); + + // shift 8 bytes right + madd_ref_src = _mm256_srli_si256(expand_madd, 8); + sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8); + + madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd); + sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum); + + // extract the low lane and the high lane and add the results + *((int *)SSE) = + _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) + +
_mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)); + + *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) + + _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1)); + } + _mm256_zeroupper(); +} + +#define FILTER_SRC(filter) \ + /* filter the source */ \ + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ + \ + /* add 8 to source */ \ + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ + \ + /* divide source by 16 */ \ + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + +#define MERGE_WITH_SRC(src_reg, reg) \ + exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ + exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); + +#define LOAD_SRC_DST \ + /* load source and destination */ \ + src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ + dst_reg = _mm256_loadu_si256((__m256i const *)(dst)); + +#define AVG_NEXT_SRC(src_reg, size_stride) \ + src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ + /* average between current and next stride source */ \ + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + +#define MERGE_NEXT_SRC(src_reg, size_stride) \ + src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ + MERGE_WITH_SRC(src_reg, src_next_reg) + +#define CALC_SUM_SSE_INSIDE_LOOP \ + /* expand each byte to 2 bytes */ \ + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ + /* source - dest */ \ + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ + /* calculate sum */ \ + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \ + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \ + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ + /* calculate
sse */ \ + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \ + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + +// final calculation to sum and sse +#define CALC_SUM_AND_SSE \ + res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ + sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ + sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ + \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ + \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); + +unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse) { + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + __m256i zero_reg; + int i, sum; + sum_reg = _mm256_set1_epi16(0); + sse_reg = _mm256_set1_epi16(0); + zero_reg = _mm256_set1_epi16(0); + + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if (y_offset == 0) { + for (i = 0; i < height; i++) { + LOAD_SRC_DST + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = 0 and y_offset = 8 + } else if (y_offset == 8) { + __m256i src_next_reg; + for (i = 0; i < height; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, src_stride) + //
expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = 0 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg; + + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, src_stride) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + } + // x_offset = 8 and y_offset = 0 + } else if (x_offset == 8) { + if (y_offset == 0) { + __m256i src_next_reg; + for (i = 0; i < height; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = 8 and y_offset = 8 + } else if (y_offset == 8) { + __m256i src_next_reg, src_avg; + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height; i++) { + src_avg = src_reg; + src += src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + // average between previous average to current average + src_avg = _mm256_avg_epu8(src_avg, src_reg); + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_avg, zero_reg) + // save current source average + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + // x_offset = 8 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg, src_avg; + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height; i++) { + // save current source average + 
src_avg = src_reg; + src += src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + MERGE_WITH_SRC(src_avg, src_reg) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + __m256i filter, pw8, src_next_reg; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = bilin interpolation and y_offset = 8 + } else if (y_offset == 8) { + __m256i filter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_reg); + MERGE_WITH_SRC(src_pack, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src_pack = src_reg; + dst += dst_stride; + } + // x_offset = bilin interpolation and y_offset = bilin interpolation + } else { + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + y_offset <<= 5; + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = 
_mm256_loadu_si256((__m256i const *)(src)); + MERGE_NEXT_SRC(src_reg, 1) + + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(xfilter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // merge previous pack to current pack source + MERGE_WITH_SRC(src_pack, src_reg) + // filter the source + FILTER_SRC(yfilter) + src_pack = src_reg; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + } + } + CALC_SUM_AND_SSE + _mm256_zeroupper(); + return sum; +} + +unsigned int aom_sub_pixel_avg_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, + int height, unsigned int *sse) { + __m256i sec_reg; + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + __m256i zero_reg; + int i, sum; + sum_reg = _mm256_set1_epi16(0); + sse_reg = _mm256_set1_epi16(0); + zero_reg = _mm256_set1_epi16(0); + + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if (y_offset == 0) { + for (i = 0; i < height; i++) { + LOAD_SRC_DST + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec += sec_stride; + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + } else if (y_offset == 8) { + __m256i src_next_reg; + for (i = 0; i < height; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, src_stride) + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec += sec_stride; + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + 
} + // x_offset = 0 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg; + + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, src_stride) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec += sec_stride; + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + } + // x_offset = 8 and y_offset = 0 + } else if (x_offset == 8) { + if (y_offset == 0) { + __m256i src_next_reg; + for (i = 0; i < height; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec += sec_stride; + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = 8 and y_offset = 8 + } else if (y_offset == 8) { + __m256i src_next_reg, src_avg; + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height; i++) { + // save current source average + src_avg = src_reg; + src += src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + // average between previous average to current average + src_avg = _mm256_avg_epu8(src_avg, src_reg); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_avg = _mm256_avg_epu8(src_avg, sec_reg); + sec += sec_stride; + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_avg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + // x_offset = 8 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg, src_avg; + y_offset <<= 5; + filter = 
_mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height; i++) { + // save current source average + src_avg = src_reg; + src += src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + MERGE_WITH_SRC(src_avg, src_reg) + FILTER_SRC(filter) + src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_avg = _mm256_avg_epu8(src_avg, sec_reg); + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_avg, zero_reg) + sec += sec_stride; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + __m256i filter, pw8, src_next_reg; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + MERGE_WITH_SRC(src_reg, zero_reg) + sec += sec_stride; + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = bilin interpolation and y_offset = 8 + } else if (y_offset == 8) { + __m256i filter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; + LOAD_SRC_DST + 
MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_reg); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_pack = _mm256_avg_epu8(src_pack, sec_reg); + sec += sec_stride; + MERGE_WITH_SRC(src_pack, zero_reg) + src_pack = src_reg; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + // x_offset = bilin interpolation and y_offset = bilin interpolation + } else { + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + y_offset <<= 5; + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + MERGE_NEXT_SRC(src_reg, 1) + + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(xfilter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // merge previous pack to current pack source + MERGE_WITH_SRC(src_pack, src_reg) + // filter the source + FILTER_SRC(yfilter) + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_pack = _mm256_avg_epu8(src_pack, sec_reg); + MERGE_WITH_SRC(src_pack, zero_reg) + src_pack = src_reg; + sec += sec_stride; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + } + } + CALC_SUM_AND_SSE + _mm256_zeroupper(); + return sum; +} diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c new file mode 100644 index 000000000..d9563aa7f --- /dev/null +++ 
// Signature shared by the fixed-size kernels in this file: each one writes the
// sum of squared differences to *sse and the signed sum of differences to
// *sum for one tile of src versus ref.
typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride,
                               unsigned int *sse, int *sum);

// Returns the sum of squares of 256 consecutive int16_t values starting at
// src (32 iterations of 8 lanes each). src needs no particular alignment.
unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
  __m128i vsum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 32; ++i) {
    const __m128i v = _mm_loadu_si128((const __m128i *)src);
    // madd squares each lane and adds neighbouring pairs into 32-bit lanes.
    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
    src += 8;
  }

  // Fold the four 32-bit lanes into lane 0.
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  return _mm_cvtsi128_si32(vsum);
}

// Loads the 4 bytes at p into the low 32 bits of a vector (upper bits zero).
// The value is assembled byte by byte, so p may have any alignment and the
// pixel buffer is never accessed through an incompatible pointer type, unlike
// a `*(const uint32_t *)p` dereference. Byte order is little-endian, which is
// the only byte order SSE2 targets have.
static __m128i read_4_pixels_sse2(const uint8_t *p) {
  const uint32_t packed = (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
                          ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
  return _mm_cvtsi32_si128((int)packed);
}

// Reads the 4-pixel rows i and i + 1 of p and interleaves them byte-wise into
// the low 8 bytes of the result: row_i[0], row_i+1[0], row_i[1], ...
#define READ64(p, stride, i)                                \
  _mm_unpacklo_epi8(read_4_pixels_sse2((p) + (i) * (stride)), \
                    read_4_pixels_sse2((p) + ((i) + 1) * (stride)))

// 4x4 kernel: *sum = sum(src - ref), *sse = sum((src - ref)^2).
// src and ref rows are interleaved identically by READ64, so the lane-wise
// subtraction still pairs each source pixel with its reference pixel.
static void get4x4var_sse2(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride,
                           unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  // Widen the 8-bit pixels to 16 bits; rows 0-1 and rows 2-3 per register.
  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
  const __m128i diff1 = _mm_sub_epi16(src1, ref1);

  // sum: fold the eight 16-bit lanes into lane 0 (|sum| <= 16 * 255).
  __m128i vsum = _mm_add_epi16(diff0, diff1);
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  // extract zero-extends, so go through int16_t to recover the sign.
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse: square and pair-add into 32-bit lanes, then fold into lane 0.
  vsum =
      _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  *sse = _mm_cvtsi128_si32(vsum);
}

// 8x8 kernel: *sum = sum(src - ref), *sse = sum((src - ref)^2).
// Two rows are processed per iteration; each row is one 8-byte load.
void aom_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref,
                        int ref_stride, unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 8; i += 2) {
    const __m128i src0 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero);
    const __m128i ref0 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero);
    const __m128i ref1 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
  }

  // sum: fold the eight 16-bit lanes into lane 0 (|sum| <= 64 * 255).
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse: fold the four 32-bit lanes into lane 0.
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}
+void aom_get16x16var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse, + int *sum) { + const __m128i zero = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + __m128i vsse = _mm_setzero_si128(); + int i; + + for (i = 0; i < 16; ++i) { + const __m128i s = _mm_loadu_si128((const __m128i *)src); + const __m128i r = _mm_loadu_si128((const __m128i *)ref); + + const __m128i src0 = _mm_unpacklo_epi8(s, zero); + const __m128i ref0 = _mm_unpacklo_epi8(r, zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + + const __m128i src1 = _mm_unpackhi_epi8(s, zero); + const __m128i ref1 = _mm_unpackhi_epi8(r, zero); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + vsum = _mm_add_epi16(vsum, diff0); + vsum = _mm_add_epi16(vsum, diff1); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + + src += src_stride; + ref += ref_stride; + } + + // sum + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + *sum = + (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1); + + // sse + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); + *sse = _mm_cvtsi128_si32(vsse); +} + +static void variance_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, int w, + int h, unsigned int *sse, int *sum, + getNxMvar_fn_t var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + +unsigned int aom_variance4x4_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + get4x4var_sse2(src, 
src_stride, ref, ref_stride, sse, &sum); + assert(sum <= 255 * 4 * 4); + assert(sum >= -255 * 4 * 4); + return *sse - ((sum * sum) >> 4); +} + +unsigned int aom_variance8x4_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum, + get4x4var_sse2, 4); + assert(sum <= 255 * 8 * 4); + assert(sum >= -255 * 8 * 4); + return *sse - ((sum * sum) >> 5); +} + +unsigned int aom_variance4x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum, + get4x4var_sse2, 4); + assert(sum <= 255 * 8 * 4); + assert(sum >= -255 * 8 * 4); + return *sse - ((sum * sum) >> 5); +} + +unsigned int aom_variance8x8_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + aom_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + assert(sum <= 255 * 8 * 8); + assert(sum >= -255 * 8 * 8); + return *sse - ((sum * sum) >> 6); +} + +unsigned int aom_variance16x8_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum, + aom_get8x8var_sse2, 8); + assert(sum <= 255 * 16 * 8); + assert(sum >= -255 * 16 * 8); + return *sse - ((sum * sum) >> 7); +} + +unsigned int aom_variance8x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum, + aom_get8x8var_sse2, 8); + assert(sum <= 255 * 16 * 8); + assert(sum >= -255 * 16 * 8); + return *sse - ((sum * sum) >> 7); +} + +unsigned int aom_variance16x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; 
+ aom_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + assert(sum <= 255 * 16 * 16); + assert(sum >= -255 * 16 * 16); + return *sse - ((uint32_t)((int64_t)sum * sum) >> 8); +} + +unsigned int aom_variance32x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, + aom_get16x16var_sse2, 16); + assert(sum <= 255 * 32 * 32); + assert(sum >= -255 * 32 * 32); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); +} + +unsigned int aom_variance32x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, + aom_get16x16var_sse2, 16); + assert(sum <= 255 * 32 * 16); + assert(sum >= -255 * 32 * 16); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); +} + +unsigned int aom_variance16x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum, + aom_get16x16var_sse2, 16); + assert(sum <= 255 * 32 * 16); + assert(sum >= -255 * 32 * 16); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); +} + +unsigned int aom_variance64x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, + aom_get16x16var_sse2, 16); + assert(sum <= 255 * 64 * 64); + assert(sum >= -255 * 64 * 64); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); +} + +unsigned int aom_variance64x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, + aom_get16x16var_sse2, 16); + assert(sum <= 255 * 64 * 32); + assert(sum >= -255 * 64 * 32); + 
return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); +} + +unsigned int aom_variance32x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum, + aom_get16x16var_sse2, 16); + assert(sum <= 255 * 64 * 32); + assert(sum >= -255 * 64 * 32); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); +} + +unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +// The 2 unused parameters are place holders for PIC enabled build. 
// The 2 unused parameters are place holders for PIC enabled build.
// These definitions are for functions defined in subpel_variance.asm
#define DECL(w, opt)                                                           \
  int aom_sub_pixel_variance##w##xh_##opt(                                     \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,    \
      const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
      void *unused0, void *unused)
#define DECLS(opt1, opt2) \
  DECL(4, opt1);          \
  DECL(8, opt1);          \
  DECL(16, opt1)

DECLS(sse2, sse2);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL

// Generates aom_sub_pixel_variance<w>x<h>_<opt>().
//   wf            width of the column kernel to call (w is covered by w / wf
//                 side-by-side strips, processed left to right)
//   wlog2, hlog2  log2 of w and h, used to divide se^2 by the pixel count
//   cast, cast_prod  casts applied to `se` before squaring and to the product,
//                 chosen per block size so the square cannot overflow
// The generated function stores the total sse in *sse_ptr and returns
// sse - se^2 / (w * h).
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                       \
  unsigned int aom_sub_pixel_variance##w##x##h##_##opt(                        \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,          \
      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {             \
    unsigned int sse;                                                          \
    int col;                                                                   \
    int se = aom_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset,   \
                                                  y_offset, dst, dst_stride,   \
                                                  h, &sse, NULL, NULL);        \
    /* remaining wf-wide column strips */                                      \
    for (col = (wf); col < (w); col += (wf)) {                                 \
      unsigned int sse2;                                                       \
      const int se2 = aom_sub_pixel_variance##wf##xh_##opt(                    \
          src + col, src_stride, x_offset, y_offset, dst + col, dst_stride, h, \
          &sse2, NULL, NULL);                                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
    }                                                                          \
    *sse_ptr = sse;                                                            \
    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));   \
  }

#define FNS(opt1, opt2)                              \
  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t));  \
  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t));  \
  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t));  \
  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t));  \
  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
  FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t));   \
  FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t));    \
  FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t));     \
  FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t));     \
  FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t));     \
  FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))

FNS(sse2, sse2);
FNS(ssse3, ssse3);

#undef FNS
#undef FN

// The 2 unused parameters are place holders for PIC enabled build.
#define DECL(w, opt)                                                        \
  int aom_sub_pixel_avg_variance##w##xh_##opt(                              \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec,         \
      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,   \
      void *unused)
#define DECLS(opt1, opt2) \
  DECL(4, opt1);          \
  DECL(8, opt1);          \
  DECL(16, opt1)

DECLS(sse2, sse2);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS

// Generates aom_sub_pixel_avg_variance<w>x<h>_<opt>(). Same scheme as the
// non-averaging generator above, with the second predictor `sec` forwarded to
// the kernel; `sec` is passed with a row stride of w.
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                       \
  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt(                    \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,          \
      const uint8_t *dst, int dst_stride, unsigned int *sseptr,                \
      const uint8_t *sec) {                                                    \
    unsigned int sse;                                                          \
    int col;                                                                   \
    int se = aom_sub_pixel_avg_variance##wf##xh_##opt(                         \
        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
        NULL, NULL);                                                           \
    /* remaining wf-wide column strips */                                      \
    for (col = (wf); col < (w); col += (wf)) {                                 \
      unsigned int sse2;                                                       \
      const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(                \
          src + col, src_stride, x_offset, y_offset, dst + col, dst_stride,    \
          sec + col, w, h, &sse2, NULL, NULL);                                 \
      se += se2;                                                               \
      sse += sse2;                                                             \
    }                                                                          \
    *sseptr = sse;                                                             \
    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));   \
  }

#define FNS(opt1, opt2)                              \
  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t));  \
  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t));  \
  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t));  \
  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t));  \
  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
  FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t));  \
  FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t));   \
  FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t));    \
  FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t));    \
  FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t));    \
  FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))

FNS(sse2, sse2);
FNS(ssse3, ssse3);

#undef FNS
#undef FN

// Loads the 4 bytes at p into the low 32 bits of a vector. Assembled byte by
// byte so that p may have any alignment and the pixel buffer is not accessed
// through an incompatible pointer type (little-endian, as on all SSE2
// targets).
static __m128i upsampled_load_4_sse2(const uint8_t *p) {
  const uint32_t packed = (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
                          ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
  return _mm_cvtsi32_si128((int)packed);
}

// Stores the low 32 bits of v to the 4 bytes at p, with the same alignment
// and aliasing guarantees as upsampled_load_4_sse2().
static void upsampled_store_4_sse2(uint8_t *p, __m128i v) {
  const uint32_t packed = (uint32_t)_mm_cvtsi128_si32(v);
  p[0] = (uint8_t)packed;
  p[1] = (uint8_t)(packed >> 8);
  p[2] = (uint8_t)(packed >> 16);
  p[3] = (uint8_t)(packed >> 24);
}

// Builds a width x height prediction in comp_pred (written contiguously, row
// after row) by taking every 8th byte of `ref` horizontally and every 8th row
// vertically (row pitch ref_stride * 8).
//
// Each 16-byte load of `ref` contributes two output pixels (its bytes 0 and
// 8); the unpack cascades below gather those bytes into consecutive
// positions. The three paths emit 16, 8 or 4 output pixels per inner
// iteration depending on width.
void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
                             const uint8_t *ref, int ref_stride) {
  int i, j;
  const int stride = ref_stride * 8;

  if (width >= 16) {
    // read 16 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 16) {
        __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
        __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
        __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
        __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
        __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64));
        __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80));
        __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96));
        __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112));
        __m128i t0, t1, t2, t3;

        // First pass: interleave pairs of loads (byte k of both, side by
        // side); the "hi" half brings byte 8 of each load to the front.
        t0 = _mm_unpacklo_epi8(s0, s1);
        s1 = _mm_unpackhi_epi8(s0, s1);
        t1 = _mm_unpacklo_epi8(s2, s3);
        s3 = _mm_unpackhi_epi8(s2, s3);
        t2 = _mm_unpacklo_epi8(s4, s5);
        s5 = _mm_unpackhi_epi8(s4, s5);
        t3 = _mm_unpacklo_epi8(s6, s7);
        s7 = _mm_unpackhi_epi8(s6, s7);

        // Second pass: the low 4 bytes of each result are 4 output pixels.
        s0 = _mm_unpacklo_epi8(t0, s1);
        s2 = _mm_unpacklo_epi8(t1, s3);
        s4 = _mm_unpacklo_epi8(t2, s5);
        s6 = _mm_unpacklo_epi8(t3, s7);
        // Concatenate the four 4-pixel groups into 16 pixels.
        s0 = _mm_unpacklo_epi32(s0, s2);
        s4 = _mm_unpacklo_epi32(s4, s6);
        s0 = _mm_unpacklo_epi64(s0, s4);

        _mm_storeu_si128((__m128i *)(comp_pred), s0);
        comp_pred += 16;
        ref += 16 * 8;
      }
      // rewind the columns consumed and step to the next sampled row
      ref += stride - (width * 8);
    }
  } else if (width >= 8) {
    // read 8 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 8) {
        __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
        __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
        __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
        __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
        __m128i t0, t1;

        t0 = _mm_unpacklo_epi8(s0, s1);
        s1 = _mm_unpackhi_epi8(s0, s1);
        t1 = _mm_unpacklo_epi8(s2, s3);
        s3 = _mm_unpackhi_epi8(s2, s3);

        s0 = _mm_unpacklo_epi8(t0, s1);
        s2 = _mm_unpacklo_epi8(t1, s3);
        s0 = _mm_unpacklo_epi32(s0, s2);

        _mm_storel_epi64((__m128i *)(comp_pred), s0);
        comp_pred += 8;
        ref += 8 * 8;
      }
      ref += stride - (width * 8);
    }
  } else {
    // read 4 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 4) {
        __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
        __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
        __m128i t0;

        t0 = _mm_unpacklo_epi8(s0, s1);
        s1 = _mm_unpackhi_epi8(s0, s1);
        s0 = _mm_unpacklo_epi8(t0, s1);

        upsampled_store_4_sse2(comp_pred, s0);
        comp_pred += 4;
        ref += 4 * 8;
      }
      ref += stride - (width * 8);
    }
  }
}

// Like aom_upsampled_pred_sse2(), but each gathered pixel is averaged with
// the corresponding pixel of `pred` (read contiguously, like comp_pred is
// written) using round-to-nearest: (ref_sample + pred + 1) >> 1. The average
// is computed in 16-bit lanes and packed back to bytes.
void aom_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred,
                                      int width, int height, const uint8_t *ref,
                                      int ref_stride) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi16(1);
  int i, j;
  const int stride = ref_stride * 8;

  if (width >= 16) {
    // read 16 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 16) {
        __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
        __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
        __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
        __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
        __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64));
        __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80));
        __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96));
        __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112));
        __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
        __m128i p1;
        __m128i t0, t1, t2, t3;

        // Gather every 8th byte, as in aom_upsampled_pred_sse2().
        t0 = _mm_unpacklo_epi8(s0, s1);
        s1 = _mm_unpackhi_epi8(s0, s1);
        t1 = _mm_unpacklo_epi8(s2, s3);
        s3 = _mm_unpackhi_epi8(s2, s3);
        t2 = _mm_unpacklo_epi8(s4, s5);
        s5 = _mm_unpackhi_epi8(s4, s5);
        t3 = _mm_unpacklo_epi8(s6, s7);
        s7 = _mm_unpackhi_epi8(s6, s7);

        s0 = _mm_unpacklo_epi8(t0, s1);
        s2 = _mm_unpacklo_epi8(t1, s3);
        s4 = _mm_unpacklo_epi8(t2, s5);
        s6 = _mm_unpacklo_epi8(t3, s7);

        // Pixels 0-7 in s0 and 8-15 in s4, widened to 16 bits.
        s0 = _mm_unpacklo_epi32(s0, s2);
        s4 = _mm_unpacklo_epi32(s4, s6);
        s0 = _mm_unpacklo_epi8(s0, zero);
        s4 = _mm_unpacklo_epi8(s4, zero);

        // Rounded average with pred.
        p1 = _mm_unpackhi_epi8(p0, zero);
        p0 = _mm_unpacklo_epi8(p0, zero);
        p0 = _mm_adds_epu16(s0, p0);
        p1 = _mm_adds_epu16(s4, p1);
        p0 = _mm_adds_epu16(p0, one);
        p1 = _mm_adds_epu16(p1, one);

        p0 = _mm_srli_epi16(p0, 1);
        p1 = _mm_srli_epi16(p1, 1);
        p0 = _mm_packus_epi16(p0, p1);

        _mm_storeu_si128((__m128i *)(comp_pred), p0);
        comp_pred += 16;
        pred += 16;
        ref += 16 * 8;
      }
      // rewind the columns consumed and step to the next sampled row
      ref += stride - (width * 8);
    }
  } else if (width >= 8) {
    // read 8 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 8) {
        __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
        __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
        __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
        __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
        __m128i p0 = _mm_loadl_epi64((const __m128i *)pred);
        __m128i t0, t1;

        t0 = _mm_unpacklo_epi8(s0, s1);
        s1 = _mm_unpackhi_epi8(s0, s1);
        t1 = _mm_unpacklo_epi8(s2, s3);
        s3 = _mm_unpackhi_epi8(s2, s3);

        s0 = _mm_unpacklo_epi8(t0, s1);
        s2 = _mm_unpacklo_epi8(t1, s3);
        s0 = _mm_unpacklo_epi32(s0, s2);
        s0 = _mm_unpacklo_epi8(s0, zero);

        p0 = _mm_unpacklo_epi8(p0, zero);
        p0 = _mm_adds_epu16(s0, p0);
        p0 = _mm_adds_epu16(p0, one);
        p0 = _mm_srli_epi16(p0, 1);
        p0 = _mm_packus_epi16(p0, zero);

        _mm_storel_epi64((__m128i *)(comp_pred), p0);
        comp_pred += 8;
        pred += 8;
        ref += 8 * 8;
      }
      ref += stride - (width * 8);
    }
  } else {
    // read 4 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 4) {
        __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
        __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
        __m128i p0 = upsampled_load_4_sse2(pred);
        __m128i t0;

        t0 = _mm_unpacklo_epi8(s0, s1);
        s1 = _mm_unpackhi_epi8(s0, s1);
        s0 = _mm_unpacklo_epi8(t0, s1);
        s0 = _mm_unpacklo_epi8(s0, zero);

        p0 = _mm_unpacklo_epi8(p0, zero);
        p0 = _mm_adds_epu16(s0, p0);
        p0 = _mm_adds_epu16(p0, one);
        p0 = _mm_srli_epi16(p0, 1);
        p0 = _mm_packus_epi16(p0, zero);

        upsampled_store_4_sse2(comp_pred, p0);
        comp_pred += 4;
        pred += 4;
        ref += 4 * 8;
      }
      ref += stride - (width * 8);
    }
  }
}